In [1]:
import csv
import random
from collections import namedtuple
from itertools import groupby, islice
from datetime import date, datetime, timedelta
from dateutil import tz
from statistics import mean
from textblob.classifiers import NaiveBayesClassifier
In [2]:
pesterbot_csv_name = 'sam.csv'
moodscope_csv_name = 'moodscope.csv'
skiplines = 31 # determined by manual inspection
diary_name = 'samdiary.txt'

Pesterbot Data

In [3]:
Message = namedtuple('Message', ['datetime', 'text'])

first, get all the lines from the files and convert the timestamps to proper datetimes

In [42]:
def load_messages_from_pesterbot_csv(pesterbot_csv_name):
    with open('sam.csv', newline='') as pesterbotcsv:
        reader = csv.reader(pesterbotcsv)
        def line_to_message(line):
            timestamp, text = line
            timestamp = int(timestamp)
            utc_dt = datetime.utcfromtimestamp(timestamp).replace(tzinfo=tz.gettz('UTC'))
            ac_dt = utc_dt.astimezone(tz.gettz('US/Central'))
            return Message(ac_dt, text)
        messages = [ line_to_message(line) for line in reader ]
        return messages

messages = load_messages_from_pesterbot_csv(pesterbot_csv_name)
messages[:2]
Out[42]:
[Message(datetime=datetime.datetime(2016, 6, 30, 2, 47, 10, tzinfo=tzfile('/usr/share/zoneinfo/US/Central')), text='Drawing Spalding'),
 Message(datetime=datetime.datetime(2016, 6, 30, 11, 15, 50, tzinfo=tzfile('/usr/share/zoneinfo/US/Central')), text='Walking to work')]

people usually finish their days past midnight. We're going to choose 6 am CST (4 am PST, 7 am EST) as the 'end of a day'. We're also going to introduce the the idea of a human_day: from 6 am CST to the next 6 am CST.

In [5]:
def datetime_to_human_day(dt):
    return (dt - timedelta(hours=6)).date()

datetime_to_human_day(datetime(2000, 1, 1, 1, 0, 0))
Out[5]:
datetime.date(1999, 12, 31)

1AM of a 'real day' is now converted to the previous human_day.

for this analysis we only care about the day a message occured on. So we can group by human day, throw away the extra datetime information, and concat all the messages together (an entry).

In [41]:
def group_entry_by_human_days(messages):
    messages = sorted(messages, key=lambda message: message.datetime)
    keyfunc = lambda message: datetime_to_human_day(message.datetime)
    def messages_to_entry(messages):
        return '; '.join([ message.text for message in messages ])
    return { human_day: messages_to_entry(grouped_messages)
            for human_day, grouped_messages
            in groupby(messages, key=keyfunc) }

human_day_to_entry = group_entry_by_human_days(messages)
human_day_to_entry[date(2016, 10, 25)][:55] + '...'
Out[41]:
'Up and out; Walking to class doing Chinese flashcards; ...'

we can optionally to write this as a diary

In [7]:
def write_diary(diary_name, human_day_to_entry):
    with open(diary_name, 'w') as outfile:
        outfile.write('\n'.join(
            [ f'{human_day}: {entry}' for human_day, entry in human_day_to_entry.items() ]
        ))

#write_diary(diary_name, human_day_to_entry)

Moodscope Data

In [8]:
Score = namedtuple('Score', ['date', 'score'])
In [43]:
def load_scores_from_moodscope_csv(moodscope_csv_name, skiplines=1):
    '''
    `skiplines` is an optional parameter that allows for skipping the first `skiplines` lines
    included because sometimes beginning moodscope data is suspect
    '''
    with open(moodscope_csv_name, newline='') as moodscope_csv:
        reader = csv.reader(moodscope_csv)
        def line_to_score(line):
            datestr, percent = line
            d = datetime.strptime(datestr, '%Y-%m-%d').date()
            score = int(percent[:-1])
            return Score(d, score)
        return [ line_to_score(line) for line in islice(reader, skiplines, None) ]
scores = load_scores_from_moodscope_csv(moodscope_csv_name, skiplines=skiplines)
scores[:2]
Out[43]:
[Score(date=datetime.date(2017, 5, 29), score=79),
 Score(date=datetime.date(2017, 5, 30), score=64)]

Correlation

In [10]:
# for kicks, let's take a look
avgscore = mean([ score.score for score in scores ])
avgscore
Out[10]:
65.24122807017544
In [11]:
def classify_entries(scores, human_day_to_entry, classification=None, date_transform=None):
    '''
    assigns a label of 'pos' or 'neg' to a pesterbot entry, based on a corresponding moodscope score
    `classification` is a function that maps scores to 'pos' or 'neg' and (optionally) 'skip' categories
    `date_transform` is a function that transforms the date that is used to index into the data map
        this is useful if you want to map from the text of the previous (or next) day (or week, year, etc)
        to mood of a particular day   
    '''
    # default date_transform is identity
    date_transform = date_transform if date_transform else (lambda day: day)
    # default classificaton is 'pos' if above average, 'neg' if below
    classification = classification if classification else (lambda score: 'pos' if score >= avgscore else 'neg')
    day_to_entry = lambda day: human_day_to_entry[date_transform(day)]
    classified_entries = [ (day_to_entry(day), classification(score)) for day, score in scores ]
    return [ (entry, classification) for entry, classification in classified_entries
            if classification is not 'skip' ]
In [12]:
def split_data_train_test(data, split_percentage=.8):
    random.shuffle(data)
    splitpos = int(len(data) * split_percentage)
    train = data[:splitpos]
    test = data[splitpos:]
    return train, test
In [13]:
def train_classifier(scores, human_day_to_entry, classification=None, date_transform=None):
    classified_entries = classify_entries(
        scores,
        human_day_to_entry_lower,
        classification=classification,
        date_transform=date_transform)
    train, test = split_data_train_test(classified_entries)
    classifier = NaiveBayesClassifier(train)
    return (classifier, classifier.accuracy(test))

first we're going to get rid of capitalized words, cause those are annoying in analysis

In [14]:
human_day_to_entry_lower = { human_day: entry.lower() for human_day, entry in human_day_to_entry.items() }

Previous Day Associations

if we shift the day back one, we can see how the previous day's entry affected a day's moodscope score

In [45]:
previous_day= lambda day: day - timedelta(days=1)
classifier_previous_day, accuracy = train_classifier(
    scores, human_day_to_entry_lower, date_transform=previous_day)
accuracy
Out[45]:
0.6304347826086957

oof, accuracy is not great 😅

In [46]:
classifier_previous_day.show_informative_features(10)
Most Informative Features
           contains(big) = True              pos : neg    =      7.0 : 1.0
        contains(jiawei) = True              pos : neg    =      6.7 : 1.0
          contains(chai) = True              neg : pos    =      6.6 : 1.0
          contains(knew) = True              pos : neg    =      6.0 : 1.0
         contains(steam) = True              pos : neg    =      6.0 : 1.0
           contains(hip) = True              pos : neg    =      6.0 : 1.0
            contains(😂😂) = True              pos : neg    =      5.7 : 1.0
      contains(wandered) = True              neg : pos    =      5.4 : 1.0
        contains(bridge) = True              neg : pos    =      5.4 : 1.0
         contains(draft) = True              neg : pos    =      5.4 : 1.0

here are the goodies! this table corresponds to which features are most closely associated with positive and negative classifications

for example, on my run the first line is contains(big) = True pos : neg = 7.8 : 1.0

this means that if, on some certain day, I use the word 'big' in pesterbot, then, in this case, I am more likely to have an above average rating on my moodscope the next day (don't ask me what the 7.8 means, I don't know)

fascinating!

*note: running this multiple times will, of course, generate different words, since train data will be randomized

Same Day Associations

let's see what happens if we don't shift it back a day (having a good/bad score will cause the use of certain words??)

In [17]:
classifier_same_day, accuracy = train_classifier(scores, human_day_to_entry_lower)
accuracy
Out[17]:
0.5434782608695652
In [18]:
classifier_same_day.show_informative_features(10)
Most Informative Features
       contains(earlier) = True              pos : neg    =      7.6 : 1.0
        contains(filled) = True              pos : neg    =      5.6 : 1.0
          contains(shop) = True              pos : neg    =      5.6 : 1.0
       contains(control) = True              neg : pos    =      5.1 : 1.0
          contains(grab) = True              neg : pos    =      5.1 : 1.0
        contains(career) = True              neg : pos    =      5.1 : 1.0
     contains(reviewing) = True              neg : pos    =      5.1 : 1.0
         contains(queue) = True              pos : neg    =      4.9 : 1.0
         contains(print) = True              pos : neg    =      4.9 : 1.0
     contains(confusing) = True              pos : neg    =      4.9 : 1.0

here, for me, the first entry is contains(earlier) = True pos : neg = 7.6 : 1.0, which could mean that on a day in which I have an above average moodscope rating I am more likely to use the word 'earlier' (perhaps I set out earlier when I am in a good mood ??)

Stricter Classification Criteria

if a day is just one point below average that doesn't really make it a bad day does it...? we can look at which words are associated with the top and bottom quartile of days

In [19]:
def get_quartiles(data):
    quarter = int(len(data) / 4)
    start, bottom_quartile, middle, top_quartile = sorted(data)[0::quarter]
    return (bottom_quartile, top_quartile)
bottom_quartile, top_quartile = get_quartiles([ score.score for score in scores ])
bottom_quartile, top_quartile
Out[19]:
(62, 71)
In [20]:
def quartile_classification(score):
    if score >= top_quartile:
        return 'pos'
    if score <= bottom_quartile:
        return 'neg'
    return 'skip'
quartile_classification(58), quartile_classification(68), quartile_classification(78)
Out[20]:
('neg', 'skip', 'pos')

Previous Day

In [55]:
classifier_previous_day_quartiles, accuracy = train_classifier(
    scores, human_day_to_entry_lower, date_transform=previous_day)
accuracy
Out[55]:
0.6086956521739131
In [56]:
classifier_previous_day_quartiles.show_informative_features(10)
Most Informative Features
          contains(days) = True              pos : neg    =      7.3 : 1.0
          contains(visa) = True              pos : neg    =      6.6 : 1.0
        contains(jiawei) = True              pos : neg    =      6.6 : 1.0
        contains(laying) = True              pos : neg    =      5.9 : 1.0
           contains(wtf) = True              pos : neg    =      5.9 : 1.0
           contains(hip) = True              pos : neg    =      5.9 : 1.0
           contains(big) = True              pos : neg    =      5.6 : 1.0
      contains(possible) = True              pos : neg    =      5.6 : 1.0
           contains(aoc) = True              neg : pos    =      5.4 : 1.0
            contains(😂😂) = True              pos : neg    =      5.2 : 1.0

contains(big) = True pos : neg = 5.6 : 1.0, 'big' is really a good word for me huh 😂

Same Day

In [23]:
classifier_same_day_quartiles, accuracy = train_classifier(
    scores, human_day_to_entry_lower, date_transform=previous_day)
accuracy
Out[23]:
0.5760869565217391
In [24]:
classifier_same_day_quartiles.show_informative_features(10)
Most Informative Features
          contains(meet) = True              neg : pos    =      9.1 : 1.0
          contains(chai) = True              neg : pos    =      7.1 : 1.0
           contains(bbb) = True              pos : neg    =      6.9 : 1.0
          contains(rain) = True              pos : neg    =      6.3 : 1.0
        contains(target) = True              pos : neg    =      6.3 : 1.0
        contains(jiawei) = True              pos : neg    =      6.3 : 1.0
           contains(hip) = True              pos : neg    =      6.3 : 1.0
           contains(big) = True              pos : neg    =      6.1 : 1.0
          contains(mins) = True              neg : pos    =      5.7 : 1.0
         contains(draft) = True              neg : pos    =      5.7 : 1.0

contains(big) = True pos : neg = 6.1 : 1.0

my triumph of two years of data collection! the word 'big' is heavily associated with a good mood!!

in all seriousness though there is a large caveat to be placed on all of this: my moodscope data is pretty heavily correlated with my sleep 😴 if I don't get enough sleep that usually drops my score way down, and if I sleep well it brings it up

so in some sense it might be useful to think of this analysis as "what words are associated with good sleep"