LDA

In [49]:
from sklearn.feature_extraction.text import CountVectorizer ## LDA models topics as probability distributions over words, so we need raw word counts (not TF-IDF)

First, make the CountVectorizer. We want raw word counts.

In [113]:
## Here we are tokenizing our text
num_features = 1000 # you can change this to whichever value you think is right.
tf_vectorizer = CountVectorizer(max_df=500, min_df=10, max_features=num_features, stop_words='english')
tf = tf_vectorizer.fit_transform(text_data) # text_data holds the raw documents
tf_feature_names = tf_vectorizer.get_feature_names() # in newer scikit-learn this is get_feature_names_out()
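
To see what the vectorizer produces, here is a toy example on three made-up sentences:

In [ ]:
toy_corpus = ["the cat sat on the mat",
              "the dog ate my homework",
              "the cat chased the dog"]
toy_vectorizer = CountVectorizer()         # default settings, just for illustration
toy_counts = toy_vectorizer.fit_transform(toy_corpus)
print(toy_vectorizer.get_feature_names())  # the vocabulary, in column order
print(toy_counts.toarray())                # one row per document, one column per word, raw counts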
In [55]:
from sklearn.decomposition import LatentDirichletAllocation ### Importing the LDA library

Getting the newsgroups data set. You can also load the data from CSV files, as sketched below.

In [99]:
from sklearn.datasets import fetch_20newsgroups
In [100]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
n_samples = 2000
data_samples = dataset.data[:n_samples]
In [85]:
type(data_samples)
Out[85]:
list
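
If your documents live in a CSV file instead, something like this would work (the file name and column name here are hypothetical):

In [ ]:
import pandas as pd

df = pd.read_csv('my_documents.csv')  # hypothetical file with one document per row
data_samples = df['text'].tolist()    # 'text' is a hypothetical column holding the raw documents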
In [115]:
tf_data_samples = tf_vectorizer.fit_transform(data_samples) # Tokenizing and getting the feature counts. 
tf_feature_names = tf_vectorizer.get_feature_names()
In [116]:
tf_data_samples
Out[116]:
<2000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 52084 stored elements in Compressed Sparse Row format>
In [103]:
num_topics = 50 # Tune this parameter. Also consider tuning max_iter and learning_offset; random_state just fixes the seed for reproducibility.
#num_topics2 = 100
#num_topics3 = 150
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=100, learning_method='online',
                                learning_offset=50., random_state=1).fit(tf_data_samples)
#lda2 = LatentDirichletAllocation(n_components=num_topics2, max_iter=100, learning_method='online',
#                                 learning_offset=50., random_state=1).fit(tf_data_samples)
#lda3 = LatentDirichletAllocation(n_components=num_topics3, max_iter=100, learning_method='online',
#                                 learning_offset=50., random_state=1).fit(tf_data_samples)
In [104]:
lda.score(tf_data_samples) # Approximate log-likelihood score. Higher (i.e. closer to zero) is better: it measures how well we can explain the data, given the parameters we have chosen.
Out[104]:
-532312.5814516542
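
Before settling on num_topics, you can fit a few candidate values and compare their scores. A rough sketch follows; the candidate values and the smaller max_iter are arbitrary choices to keep the search cheap, and since scoring on the training data tends to favor larger models, consider scoring held-out documents instead.

In [ ]:
for k in [10, 25, 50, 100]:  # candidate topic counts; adjust to your corpus
    candidate = LatentDirichletAllocation(n_components=k, max_iter=10, learning_method='online',
                                          learning_offset=50., random_state=1).fit(tf_data_samples)
    # higher score and lower perplexity are better
    print(k, candidate.score(tf_data_samples), candidate.perplexity(tf_data_samples))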
In [117]:
def print_topics(model, vectorizer, top_n=5):    #### This is the code to print the topics. top_n can be changed.
    feature_names = vectorizer.get_feature_names() # look the vocabulary up once, not once per topic
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(feature_names[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
In [105]:
print_topics(lda, tf_vectorizer) ### Printing all the topics and the top 5 words in each.
Topic 0:
[('500', 55.346394059110835), ('offer', 45.111335983135234), ('printer', 42.735685424514166), ('hp', 38.928386233400722), ('300', 37.870188907101721)]
Topic 1:
[('contact', 0.020000010686429252), ('don', 0.020000007167755556), ('like', 0.02000000696823322), ('effect', 0.020000006231511294), ('bike', 0.020000005863177801)]
Topic 2:
[('problem', 244.7174847020267), ('mac', 48.974769254866452), ('make', 31.216831830039926), ('drivers', 27.746354872140536), ('case', 25.627149769343454)]
Topic 3:
[('speed', 0.020000007696311502), ('good', 0.020000005964905372), ('point', 0.020000005498951132), ('performance', 0.020000005289097844), ('life', 0.020000004223411329)]
Topic 4:
[('taking', 0.020000002425073906), ('leave', 0.020000002262644156), ('space', 0.020000002221044776), ('fit', 0.020000002201977864), ('assume', 0.020000002188489699)]
Topic 5:
[('card', 151.46960803416553), ('data', 121.1667197170472), ('speed', 70.910350336780041), ('memory', 65.407688992582109), ('video', 48.485961947559538)]
Topic 6:
[('win', 94.893405253258322), ('second', 94.034553655808921), ('great', 70.123172957162382), ('text', 38.120710956735572), ('runs', 33.333985474621073)]
Topic 7:
[('game', 141.59700990095152), ('team', 125.7765764772705), ('play', 116.05990348687729), ('games', 61.748195876819736), ('season', 60.452778919455191)]
Topic 8:
[('100', 59.324672146591332), ('radio', 48.362158555066848), ('cable', 35.365544702758356), ('listen', 19.903706897725307), ('miles', 8.2124423254124288)]
Topic 9:
[('bike', 71.245072996050197), ('used', 62.967104494776962), ('price', 56.390181038380732), ('water', 54.551315270898506), ('light', 40.058411112861592)]
Topic 10:
[('book', 99.592905477910918), ('reply', 54.850392463424946), ('email', 50.4575466451765), ('reference', 49.286152426920552), ('looking', 46.216181808125633)]
Topic 11:
[('possible', 0.020000002376723367), ('cut', 0.020000002364609297), ('death', 0.020000002215236696), ('believe', 0.020000002189067657), ('western', 0.020000002187267267)]
Topic 12:
[('edu', 0.020000006846874172), ('graphics', 0.020000005423784332), ('128', 0.020000004862588197), ('actually', 0.020000004552850514), ('pub', 0.020000004535546373)]
Topic 13:
[('greek', 61.325693770947211), ('does', 53.106400866508935), ('love', 39.64199060273608), ('thought', 38.627604825429337), ('house', 32.085570831517529)]
Topic 14:
[('00', 103.31912080863431), ('basically', 44.298708836224669), ('earth', 10.065010417743274), ('way', 9.6046817573410834), ('make', 4.5151460180570568)]
Topic 15:
[('point', 90.125745406557314), ('line', 73.480417156885352), ('level', 52.697400391118904), ('instead', 46.14144707912277), ('signal', 42.95475432772205)]
Topic 16:
[('know', 0.020000005612242334), ('interested', 0.020000003982861031), ('does', 0.02000000396885249), ('help', 0.020000003841928491), ('thanks', 0.020000003826078971)]
Topic 17:
[('good', 267.01579667437733), ('course', 52.341594599393694), ('think', 52.244914198589008), ('won', 42.812138716661934), ('recommend', 32.931297913874033)]
Topic 18:
[('ve', 206.386225807263), ('claim', 56.717933837112078), ('word', 36.596433703514506), ('source', 30.935951079494405), ('buying', 27.979924635594447)]
Topic 19:
[('windows', 183.49738505084102), ('file', 169.80241752359606), ('version', 164.30883149002253), ('software', 162.20792528869711), ('use', 149.45901377787015)]
Topic 20:
[('works', 89.24688169296779), ('try', 78.758190039606546), ('tell', 44.691293832957911), ('tried', 41.158368820532708), ('value', 34.797349653698113)]
Topic 21:
[('goes', 0.020000004007008174), ('person', 0.020000003882128674), ('memory', 0.020000003783234726), ('story', 0.020000003693248815), ('real', 0.02000000345303337)]
Topic 22:
[('information', 107.1303172899469), ('true', 59.394156453891753), ('university', 53.719054766257003), ('1993', 53.093917485670254), ('research', 52.564985852533752)]
Topic 23:
[('key', 190.44018368830103), ('chip', 85.714804498928928), ('encryption', 83.465289427998329), ('clipper', 81.444502447260945), ('keys', 81.145662710046111)]
Topic 24:
[('people', 576.41677396978491), ('just', 513.46066166923038), ('don', 496.780584928167), ('like', 424.83380974624475), ('know', 384.18793964550719)]
Topic 25:
[('drive', 194.9646457867224), ('10', 192.22367687470137), ('55', 174.22645368378844), ('disk', 142.31577234098603), ('11', 139.1093086871891)]
Topic 26:
[('current', 0.020000005569017956), ('au', 0.020000003838642026), ('problem', 0.020000003818874748), ('university', 0.020000003805121996), ('unless', 0.020000003804516768)]
Topic 27:
[('graphics', 0.020000006290581625), ('edu', 0.020000004862309476), ('3d', 0.020000004471351529), ('sun', 0.020000003912130474), ('start', 0.0200000038796776)]
Topic 28:
[('edu', 363.77868173645402), ('com', 178.57354415800333), ('mail', 148.92222197794803), ('graphics', 124.9953874533785), ('send', 120.79785109418515)]
Topic 29:
[('scsi', 89.918376567374281), ('think', 81.559592400301781), ('need', 73.539705169852624), ('president', 71.360746910671295), ('like', 59.737966793004396)]
Topic 30:
[('edu', 0.02000001767060702), ('mail', 0.02000001318210879), ('graphics', 0.020000009702531567), ('pub', 0.020000008669426562), ('work', 0.020000006967443758)]
Topic 31:
[('stuff', 84.598053017142732), ('post', 75.829461043139901), ('want', 74.806193924396467), ('know', 59.839587851375441), ('folks', 41.225869642652995)]
Topic 32:
[('blood', 38.860591381772615), ('rules', 38.293424571830698), ('areas', 31.4465139038372), ('break', 27.470683312634211), ('nature', 12.874662757884771)]
Topic 33:
[('gm', 62.471441288681753), ('vs', 61.835939835684968), ('john', 37.026619989856854), ('copies', 32.507426381469955), ('st', 30.30753995390813)]
Topic 34:
[('use', 124.33095256564441), ('power', 117.86228387172318), ('apple', 50.054647130779024), ('wondering', 39.670832909787286), ('advance', 36.852814419912292)]
Topic 35:
[('force', 0.020000002375862393), ('32', 0.020000002229520204), ('men', 0.020000002225852343), ('carry', 0.020000002225794615), ('wish', 0.020000002222442081)]
Topic 36:
[('heard', 88.46540593188584), ('seen', 73.080830084027212), ('likely', 34.109610690795733), ('science', 31.900754094451962), ('method', 26.000194081070944)]
Topic 37:
[('space', 161.86168218093769), ('10', 89.07869743103933), ('new', 82.114744241442637), ('health', 73.829776471548229), ('earth', 72.015133213236339)]
Topic 38:
[('god', 354.21827459519375), ('jesus', 123.90191031972314), ('church', 94.020201274332138), ('faith', 69.645771508562163), ('life', 46.073800590583289)]
Topic 39:
[('probably', 176.08442212175626), ('memory', 0.020000180712871362), ('free', 0.020000015702580667), ('used', 0.02000000985464288), ('copies', 0.020000009283006489)]
Topic 40:
[('window', 95.490161977793107), ('place', 52.980470032845297), ('manager', 39.982466323115325), ('2nd', 11.626915295652985), ('contains', 2.4549769341296788)]
Topic 41:
[('don', 0.020000011005744209), ('mean', 0.020000007699003411), ('say', 0.020000007326055991), ('want', 0.020000007223567807), ('like', 0.020000007067876632)]
Topic 42:
[('thanks', 184.96033335527551), ('wonder', 37.156222875068565), ('communications', 12.771604799240169), ('san', 12.468242971986742), ('religious', 4.0680100358754157)]
Topic 43:
[('cost', 57.567726447467983), ('soon', 45.104912175526955), ('term', 38.554843232706702), ('built', 35.19538917691186), ('maybe', 32.56395268567811)]
Topic 44:
[('bit', 65.729437447851538), ('pretty', 44.270538739515061), ('mode', 43.560192480099317), ('cd', 36.525268104620302), ('kind', 33.225956115804884)]
Topic 45:
[('hit', 46.882032849395102), ('condition', 39.997652494807284), ('defense', 37.313287129643825), ('single', 37.007876014356434), ('pick', 35.936121167177077)]
Topic 46:
[('car', 188.27223766869079), ('cars', 70.632392435918732), ('better', 65.125829191728016), ('oil', 59.942865211072572), ('engine', 59.823681780534137)]
Topic 47:
[('year', 178.63256600314702), ('new', 108.11199323216483), ('years', 90.720237270059826), ('old', 85.059690444731956), ('ago', 65.202397824129065)]
Topic 48:
[('government', 157.70917932932065), ('law', 142.90459758849028), ('state', 75.873232633992075), ('section', 74.689095612321367), ('gun', 72.529645118966982)]
Topic 49:
[('people', 111.58410041453214), ('said', 107.60680838894234), ('israel', 103.44405550413748), ('women', 72.88063335435173), ('armenians', 70.796837041231555)]

There seem to be a lot of noisy words ('don', 've', 'just', and so on); make sure you remove them in your analysis.
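
One way to do that is to extend the built-in English stop-word list and re-fit the vectorizer. A minimal sketch, where the extra words are just examples picked from the topics above:

In [ ]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

noisy_words = ['don', 've', 'just', 'like', 'know'] # examples from the topics above; extend as needed
custom_stop_words = list(ENGLISH_STOP_WORDS.union(noisy_words))
tf_vectorizer = CountVectorizer(max_df=500, min_df=10, max_features=num_features,
                                stop_words=custom_stop_words)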

In [106]:
doc_topic_dist = lda.transform(tf_data_samples) ## Getting the topic distribution for each document.
In [110]:
doc_topic_dist[23] ## This gives the topic distribution for the document at index 23 (the 24th document).
Out[110]:
array([ 0.00066667,  0.00066667,  0.00066667,  0.00066667,  0.00066667,
        0.00066667,  0.00066667,  0.00066667,  0.00066667,  0.00066667,
        0.00066667,  0.00066667,  0.00066667,  0.00066667,  0.00066667,
        0.00066667,  0.00066667,  0.00066667,  0.00066667,  0.00066667,
        0.00066667,  0.00066667,  0.00066667,  0.08566163,  0.33900922,
        0.00066667,  0.00066667,  0.00066667,  0.00066667,  0.00066667,
        0.00066667,  0.00066667,  0.00066667,  0.00066667,  0.00066667,
        0.00066667,  0.00066667,  0.00066667,  0.03662343,  0.00066667,
        0.00066667,  0.00066667,  0.00066667,  0.00066667,  0.00066667,
        0.00066667,  0.00066667,  0.00066667,  0.50803906,  0.00066667])
In [118]:
import pandas as pd
pd.Series(doc_topic_dist[23]).idxmax() # This gives you the topic with the highest contribution. Here it is topic 48, i.e. the 49th topic (indices start at zero).
Out[118]:
48
In [112]:
pd.Series(doc_topic_dist[23]).max() # This gives you that topic's contribution, about 51% here.
Out[112]:
0.50803906424673473
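
The same idea extends to the whole corpus: taking the argmax along each row of doc_topic_dist gives every document's dominant topic, which you can then tabulate. A minimal sketch:

In [ ]:
dominant_topic = doc_topic_dist.argmax(axis=1)     # dominant topic index for each document
pd.Series(dominant_topic).value_counts().head(10)  # the topics that dominate the most documents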