import csv
import random
from collections import namedtuple
from itertools import groupby, islice
from datetime import date, datetime, timedelta
from dateutil import tz
from statistics import mean
from textblob.classifiers import NaiveBayesClassifier
# pesterbot export; each row is parsed as (unix timestamp, message text) by the loader below
pesterbot_csv_name = 'sam.csv'
# moodscope export; each row is parsed as (YYYY-MM-DD date string, percent string) by the loader below
moodscope_csv_name = 'moodscope.csv'
# number of leading moodscope rows to skip when loading
skiplines = 31 # determined by manual inspection
# output path used by write_diary
diary_name = 'samdiary.txt'
# one pesterbot message: the datetime it was sent and its text
Message = namedtuple('Message', ['datetime', 'text'])
first, get all the lines from the files and convert the timestamps to proper datetimes
def load_messages_from_pesterbot_csv(pesterbot_csv_name):
    '''
    reads the pesterbot csv at `pesterbot_csv_name` and returns a list of `Message`s
    each row must be a (unix timestamp, text) pair; the timestamp is converted
    to a timezone-aware datetime in US/Central
    raises ValueError if a row does not have exactly two columns or the
    timestamp is not an integer
    '''
    def line_to_message(line):
        timestamp, text = line
        # build an aware UTC datetime directly (utcfromtimestamp is deprecated
        # and returns a naive datetime), then convert it to US/Central
        utc_dt = datetime.fromtimestamp(int(timestamp), tz=tz.tzutc())
        ac_dt = utc_dt.astimezone(tz.gettz('US/Central'))
        return Message(ac_dt, text)

    # open the file that was asked for rather than a hard-coded name
    with open(pesterbot_csv_name, newline='') as pesterbotcsv:
        reader = csv.reader(pesterbotcsv)
        return [line_to_message(line) for line in reader]
# load every pesterbot message from the csv named above
messages = load_messages_from_pesterbot_csv(pesterbot_csv_name)
# peek at the first two messages as a sanity check
messages[:2]
People usually finish their days past midnight, so we're going to choose 6 am CST (4 am PST, 7 am EST) as the 'end of a day'. We're also going to introduce the idea of a human_day: the span from 6 am CST to the next 6 am CST.
def datetime_to_human_day(dt):
    '''
    maps a datetime to its human day: the calendar date of the moment six hours earlier
    so anything before 6 am counts towards the previous date
    '''
    day_start_offset = timedelta(hours=6)
    shifted = dt - day_start_offset
    return shifted.date()
# example: 1 AM on 2000-01-01 is before the 6-hour cutoff, so it maps to 1999-12-31
datetime_to_human_day(datetime(2000, 1, 1, 1, 0, 0))
1AM of a 'real day' is now converted to the previous human_day
.
For this analysis we only care about the day a message occurred on. So we can group by human day, throw away the extra datetime information, and concatenate all of that day's messages together (an entry).
def group_entry_by_human_days(messages):
    '''
    collapses messages into one entry per human day
    messages are ordered by datetime, grouped by `datetime_to_human_day`, and the
    texts of each group are joined with '; '
    returns a dict mapping human day -> entry string
    '''
    def human_day_of(message):
        return datetime_to_human_day(message.datetime)

    chronological = sorted(messages, key=lambda message: message.datetime)
    entries = {}
    for human_day, day_messages in groupby(chronological, key=human_day_of):
        entries[human_day] = '; '.join([ message.text for message in day_messages ])
    return entries
# map each human day to its single concatenated entry string
human_day_to_entry = group_entry_by_human_days(messages)
# preview the first 55 characters of one day's entry (fails with KeyError if that day has no entry)
human_day_to_entry[date(2016, 10, 25)][:55] + '...'
we can optionally write this out as a diary
def write_diary(diary_name, human_day_to_entry):
    '''
    writes the entries to the file `diary_name`, one '<human_day>: <entry>' line per day
    lines are separated by newlines, with no trailing newline; an existing file is overwritten
    '''
    with open(diary_name, 'w') as diary_file:
        diary_lines = [ f'{day}: {text}' for day, text in human_day_to_entry.items() ]
        diary_text = '\n'.join(diary_lines)
        diary_file.write(diary_text)
#write_diary(diary_name, human_day_to_entry)
# one moodscope reading: the calendar date and its integer score
Score = namedtuple('Score', ['date', 'score'])
def load_scores_from_moodscope_csv(moodscope_csv_name, skiplines=1):
    '''
    reads the moodscope csv at `moodscope_csv_name` and returns a list of `Score`s
    each row must be a (YYYY-MM-DD date string, percent string) pair; the last
    character of the percent string is dropped before converting it to an int
    `skiplines` is an optional parameter that allows for skipping the first `skiplines` lines
    included because sometimes beginning moodscope data is suspect
    '''
    parsed_scores = []
    with open(moodscope_csv_name, newline='') as moodscope_csv:
        rows = csv.reader(moodscope_csv)
        for datestr, percent in islice(rows, skiplines, None):
            day = datetime.strptime(datestr, '%Y-%m-%d').date()
            value = int(percent[:-1])
            parsed_scores.append(Score(day, value))
    return parsed_scores
# load the moodscope scores, skipping the first `skiplines` rows of the file
scores = load_scores_from_moodscope_csv(moodscope_csv_name, skiplines=skiplines)
# peek at the first two scores as a sanity check
scores[:2]
# for kicks, let's take a look at the average score
# NOTE: the default classification inside classify_entries reads `avgscore` as a global
avgscore = mean([ score.score for score in scores ])
avgscore
def classify_entries(scores, human_day_to_entry, classification=None, date_transform=None):
    '''
    assigns a label of 'pos' or 'neg' to a pesterbot entry, based on a corresponding moodscope score
    `scores` is an iterable of (day, score) pairs
    `human_day_to_entry` maps a day to its entry text; a day that is missing after
    `date_transform` raises KeyError when the mapping is a dict
    `classification` is a function that maps scores to 'pos' or 'neg' and (optionally) 'skip' categories
    the default is 'pos' if the score is at or above the module-level `avgscore`, 'neg' otherwise
    `date_transform` is a function that transforms the date that is used to index into the data map
    this is useful if you want to map from the text of the previous (or next) day (or week, year, etc)
    to mood of a particular day
    returns a list of (entry, label) pairs, without the pairs labelled 'skip'
    '''
    # default date_transform is identity
    if date_transform is None:
        date_transform = lambda day: day
    # default classification is 'pos' if at or above average, 'neg' if below
    if classification is None:
        classification = lambda score: 'pos' if score >= avgscore else 'neg'

    labelled_entries = [
        (human_day_to_entry[date_transform(day)], classification(score))
        for day, score in scores
    ]
    # compare by value: `is not 'skip'` only worked when the label happened
    # to be the very same string object as the literal
    return [ (entry, label) for entry, label in labelled_entries if label != 'skip' ]
def split_data_train_test(data, split_percentage=.8):
    '''
    randomly splits `data` into a (train, test) pair of lists
    `split_percentage` is the fraction of items that goes into train (rounded down)
    the input itself is left untouched, so any sequence (including a tuple) is accepted
    '''
    # shuffle a copy so the caller's data is not reordered as a side effect
    shuffled = list(data)
    random.shuffle(shuffled)
    splitpos = int(len(shuffled) * split_percentage)
    return shuffled[:splitpos], shuffled[splitpos:]
def train_classifier(scores, human_day_to_entry, classification=None, date_transform=None):
    '''
    labels the entries via `classify_entries`, splits them into train/test sets,
    and trains a NaiveBayesClassifier on the train set
    `classification` and `date_transform` are passed straight through to `classify_entries`
    returns a (classifier, accuracy on the test set) pair
    '''
    classified_entries = classify_entries(
        scores,
        # use the mapping the caller passed in, not the global lower-cased one
        human_day_to_entry,
        classification=classification,
        date_transform=date_transform)
    train, test = split_data_train_test(classified_entries)
    classifier = NaiveBayesClassifier(train)
    return (classifier, classifier.accuracy(test))
first we're going to get rid of capitalized words, cause those are annoying in analysis
# same mapping as human_day_to_entry, with every entry lower-cased
human_day_to_entry_lower = { human_day: entry.lower() for human_day, entry in human_day_to_entry.items() }
if we shift the day back one, we can see how the previous day's entry affected a day's moodscope score
# date_transform that looks up the entry written one day before the scored day
previous_day= lambda day: day - timedelta(days=1)
# train with the default classification, pairing each score with the previous day's entry
classifier_previous_day, accuracy = train_classifier(
scores, human_day_to_entry_lower, date_transform=previous_day)
# show the accuracy on the held-out test split
accuracy
oof, accuracy is not great 😅
# list the 10 most informative features of the previous-day classifier
classifier_previous_day.show_informative_features(10)
here are the goodies! this table corresponds to which features are most closely associated with positive and negative classifications
for example, on my run the first line is contains(big) = True pos : neg = 7.8 : 1.0
this means that if, on some certain day, I use the word 'big' in pesterbot, then, in this case, I am more likely to have an above average rating on my moodscope the next day (don't ask me what the 7.8 means, I don't know)
fascinating!
*note: running this multiple times will, of course, generate different words, since train data will be randomized
let's see what happens if we don't shift it back a day (having a good/bad score will cause the use of certain words??)
# no date_transform: each score is paired with the entry of the same day
classifier_same_day, accuracy = train_classifier(scores, human_day_to_entry_lower)
# show the accuracy on the held-out test split
accuracy
# list the 10 most informative features of the same-day classifier
classifier_same_day.show_informative_features(10)
here, for me, the first entry is contains(earlier) = True pos : neg = 7.6 : 1.0
, which could mean that on a day in which I have an above average moodscope rating I am more likely to use the word 'earlier' (perhaps I set out earlier when I am in a good mood ??)
if a day is just one point below average that doesn't really make it a bad day does it...? we can look at which words are associated with the top and bottom quartile of days
def get_quartiles(data):
    '''
    returns a (bottom_quartile, top_quartile) pair of values taken from `data`
    the values are the items found a quarter and three quarters of the way through
    the sorted data, so they are always actual data points
    works for any non-empty length, not only multiples of four
    raises ValueError if `data` is empty
    '''
    ordered = sorted(data)
    if not ordered:
        raise ValueError('get_quartiles requires at least one data point')
    count = len(ordered)
    # index directly instead of unpacking a strided slice, which only yielded
    # exactly four items when the length was divisible by four
    return (ordered[count // 4], ordered[(3 * count) // 4])
# quartile cut-offs over all moodscope scores; read as globals by quartile_classification
bottom_quartile, top_quartile = get_quartiles([ score.score for score in scores ])
# show the two cut-offs
bottom_quartile, top_quartile
def quartile_classification(score):
    '''
    labels a score using the module-level quartile cut-offs
    'pos' at or above `top_quartile`, 'neg' at or below `bottom_quartile`, 'skip' otherwise
    '''
    label = 'skip'
    if score >= top_quartile:
        label = 'pos'
    elif score <= bottom_quartile:
        label = 'neg'
    return label
# try three sample scores; the labels depend on the quartile cut-offs computed above
quartile_classification(58), quartile_classification(68), quartile_classification(78)
# previous day's entry, labelled by quartile: only top-quartile ('pos') and
# bottom-quartile ('neg') days are kept, the rest are skipped
# (without classification=quartile_classification this just repeats the average-based run)
classifier_previous_day_quartiles, accuracy = train_classifier(
    scores,
    human_day_to_entry_lower,
    classification=quartile_classification,
    date_transform=previous_day)
# show the accuracy on the held-out test split
accuracy
# list the 10 most informative features
classifier_previous_day_quartiles.show_informative_features(10)
contains(big) = True pos : neg = 5.6 : 1.0
, 'big' is really a good word for me huh 😂
# same day's entry, labelled by quartile: no date_transform here, otherwise this
# would be the previous-day run again
classifier_same_day_quartiles, accuracy = train_classifier(
    scores,
    human_day_to_entry_lower,
    classification=quartile_classification)
# show the accuracy on the held-out test split
accuracy
# list the 10 most informative features
classifier_same_day_quartiles.show_informative_features(10)
contains(big) = True pos : neg = 6.1 : 1.0
my triumph of two years of data collection! the word 'big' is heavily associated with a good mood!!
in all seriousness though there is a large caveat to be placed on all of this: my moodscope data is pretty heavily correlated with my sleep 😴 if I don't get enough sleep that usually drops my score way down, and if I sleep well it brings it up
so in some sense it might be useful to think of this analysis as "what words are associated with good sleep"