Import the necessary libraries: CountVectorizer now, and LatentDirichletAllocation (LDA) later.

In [30]:
from sklearn.feature_extraction.text import CountVectorizer ## LDA models each topic as a probability distribution over words, so we need raw word counts

Set up the vocabulary

In [31]:
## Here we are tokenizing our text
num_features = 1000 # you can change this to whichever value you think is right. 
tf_vectorizer = CountVectorizer(max_df=500, min_df=10, max_features=num_features, stop_words='english')
# max_df=500 drops words that appear in more than 500 documents; min_df=10 drops words that appear in fewer than 10.
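To see what the vectorizer actually does, here is a small sketch on a two-sentence toy corpus (the sentences are made up for illustration and are not part of the data used below):

In [ ]:
toy_docs = ["The cat sat on the mat.", "The dog chased the cat."]
toy_vectorizer = CountVectorizer(stop_words='english')   # no frequency limits needed for a toy corpus
toy_counts = toy_vectorizer.fit_transform(toy_docs)
print(toy_vectorizer.get_feature_names())   # learned vocabulary, e.g. ['cat', 'chased', 'dog', 'mat', 'sat']
print(toy_counts.toarray())                 # word counts per document (rows = documents, columns = vocabulary)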
In [32]:
from sklearn.decomposition import LatentDirichletAllocation ### Importing the LDA library

Using the 20 Newsgroups data. You could also load your own CSV files here, as sketched below.

In [33]:
from sklearn.datasets import fetch_20newsgroups
In [34]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
n_samples = 2000
data_samples = dataset.data[:n_samples]
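If you prefer your own data, a minimal sketch for loading it (assuming a hypothetical file my_documents.csv with a column named text):

In [ ]:
import pandas as pd
my_df = pd.read_csv('my_documents.csv')              # hypothetical file name and layout
data_samples = my_df['text'].astype(str).tolist()    # list of raw document strings, as expected by the vectorizer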
In [35]:
tf_data_samples = tf_vectorizer.fit_transform(data_samples) # Tokenizing and getting the feature counts. 
tf_feature_names = tf_vectorizer.get_feature_names() # the word corresponding to each column; on newer scikit-learn versions this method is replaced by get_feature_names_out()
In [36]:
tf_data_samples ## This is a sparse document-term matrix (rows = documents, columns = vocabulary words)
Out[36]:
<2000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 52084 stored elements in Compressed Sparse Row format>
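To peek inside the matrix, a small sketch that lists the most frequent vocabulary words in the first document (it relies only on the objects defined above):

In [ ]:
row = tf_data_samples[0].toarray().ravel()                 # dense count vector for the first document
top = row.argsort()[::-1][:5]                              # column indices of its 5 most frequent words
print([(tf_feature_names[i], int(row[i])) for i in top])   # (word, count) pairs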

Setting up your LDA model

In [53]:
num_topics = 20 # Tune this parameter. Also consider tuning max_iter and learning_offset; random_state only fixes the seed for reproducibility. 
#num_topics2 = 100
#num_topics3 = 150
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=100, learning_method='online',
                                learning_offset=50., random_state=1).fit(tf_data_samples)
In [54]:
lda.score(tf_data_samples) # Approximate log-likelihood score; since it is negative, a value closer to zero is better. Roughly, it measures how well the model, with the parameters we chose, explains the data. 
Out[54]:
-522239.91518911661
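One way to choose num_topics is to fit a few candidate models and compare their scores on the same data; a rough sketch (the candidate values are just examples):

In [ ]:
for k in (10, 20, 50):                                   # candidate topic counts (example values)
    candidate = LatentDirichletAllocation(n_components=k, max_iter=100,
                                          learning_method='online',
                                          learning_offset=50., random_state=1)
    candidate.fit(tf_data_samples)
    print(k, candidate.score(tf_data_samples))           # higher (closer to zero) is better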
In [39]:
def print_topics(model, vectorizer, top_n=5):    #### This is the code to print the topics. top_n can be changed. 
    feature_names = vectorizer.get_feature_names()        # the word for each column of model.components_
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])  # indices of the top_n largest weights, in descending order
In [55]:
print_topics(lda,tf_vectorizer) ### Printing all the topics, and the top 5 words in them. 
Topic 0:
[('game', 142.86576505437301), ('team', 102.82650246188074), ('win', 92.750693115253512), ('play', 83.022559790109284), ('year', 72.375347830016821)]
Topic 1:
[('power', 163.9515961125538), ('stuff', 78.687280535774448), ('engine', 60.704428842049111), ('gas', 41.704231669406042), ('air', 36.411952347873878)]
Topic 2:
[('vs', 62.941858309261733), ('cases', 48.721291356564421), ('hear', 29.979740350110276), ('effective', 21.902177323298869), ('dr', 18.070117017771228)]
Topic 3:
[('accept', 43.517900707921108), ('pro', 37.249603639123009), ('wonder', 37.18556160027719), ('choice', 32.73761509200839), ('gov', 19.76404539723017)]
Topic 4:
[('new', 0.050000006025692136), ('right', 0.05000000587370558), ('answer', 0.05000000562174152), ('ll', 0.050000005406962457), ('work', 0.050000005225275541)]
Topic 5:
[('10', 184.24631670552702), ('55', 174.2564537427177), ('11', 155.63252582051453), ('15', 104.77086379178229), ('17', 101.93983462928104)]
Topic 6:
[('drive', 208.62668860528396), ('disk', 166.60961966943725), ('drives', 128.87145393952753), ('hard', 128.11604236514981), ('scsi', 107.06939148845437)]
Topic 7:
[('church', 93.988251971930396), ('mary', 34.717679582132043), ('st', 34.175534028859794), ('sin', 34.084118221004012), ('nhl', 32.531257056989574)]
Topic 8:
[('israel', 102.44916004153555), ('jews', 68.665529404274807), ('jewish', 59.908008218375471), ('israeli', 43.07564622434991), ('attacks', 41.468095104159474)]
Topic 9:
[('00', 94.522341076897249), ('new', 91.46996574766554), ('interested', 75.038903291556764), ('sale', 50.18173855730052), ('thanks', 46.430502635268148)]
Topic 10:
[('program', 96.400762075232265), ('information', 81.391579586695599), ('university', 56.744220166187247), ('true', 51.308941857231282), ('research', 50.972872466542078)]
Topic 11:
[('point', 85.415473447139334), ('line', 77.465987046350634), ('current', 56.044975154131592), ('points', 51.930963353502783), ('normal', 38.995447761011157)]
Topic 12:
[('use', 226.89387414672723), ('like', 218.74398305267789), ('know', 198.25357505624413), ('windows', 172.43811336302483), ('problem', 171.35977115268972)]
Topic 13:
[('people', 630.70086671411559), ('just', 506.07981892872272), ('don', 480.35074629628036), ('like', 418.21792723387631), ('think', 417.96277241032482)]
Topic 14:
[('key', 193.89750938134165), ('government', 120.08431812454604), ('chip', 104.02412712787037), ('use', 92.618842324538846), ('encryption', 83.495289447296599)]
Topic 15:
[('space', 181.07300285699313), ('years', 99.530972958886338), ('new', 93.788002950175411), ('earth', 81.98759153986596), ('000', 76.035485807158722)]
Topic 16:
[('window', 82.216089912260173), ('color', 76.60974474267762), ('screen', 46.296030359181742), ('printer', 43.768488259518847), ('does', 35.267397770427685)]
Topic 17:
[('car', 181.14223281328569), ('bike', 78.203903031695106), ('cars', 69.530514601897011), ('insurance', 57.60897533204755), ('year', 47.25010718342886)]
Topic 18:
[('law', 113.80371737946284), ('government', 102.60769188243164), ('mr', 96.761677038096025), ('state', 74.725443438010558), ('gun', 73.787363729311522)]
Topic 19:
[('edu', 389.8021571890593), ('com', 172.29031001846388), ('mail', 160.38335573624946), ('graphics', 158.64695301883552), ('send', 147.08765388326071)]

If you look carefully at the list above, you will see that many of the top words are non-informative. It is up to you to figure out a way to exclude these words; one possible approach is sketched below.
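One option (not from the original notebook) is to extend the built-in English stop word list with the uninformative words you spot, then re-fit the vectorizer and the LDA model; the extra words below are just examples:

In [ ]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

extra_stop_words = ['ll', 'don', 've', 'just', 'like', 'know', 'does']   # example words to drop
custom_stop_words = list(ENGLISH_STOP_WORDS.union(extra_stop_words))
tf_vectorizer = CountVectorizer(max_df=500, min_df=10, max_features=num_features,
                                stop_words=custom_stop_words)            # re-fit and re-run LDA after this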

In [56]:
doc_topic_dist = lda.transform(tf_data_samples) ## Getting the topic distribution in each document. 
In [57]:
doc_topic_dist[7] ## This gives the topic distribution for the document at index 7 (the 8th document). 
Out[57]:
array([ 0.00192308,  0.00192308,  0.00192308,  0.00192308,  0.00192308,
        0.03791895,  0.00192308,  0.00192308,  0.04793545,  0.00192308,
        0.00192308,  0.00192308,  0.00192308,  0.88145329,  0.00192308,
        0.00192308,  0.00192308,  0.00192308,  0.00192308,  0.00192308])
In [43]:
import pandas as pd
In [58]:
pd.Series(doc_topic_dist[7]).idxmax() # This gives you the topic with the highest contribution. Remember that indexing starts with 0 and not 1. 
Out[58]:
13
In [59]:
pd.Series(doc_topic_dist[7]).max() # This gives that topic's contribution (its probability within the document). 
Out[59]:
0.88145329262458338
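To label every document at once with its dominant topic, a short sketch using the objects defined above:

In [ ]:
dominant_topic = doc_topic_dist.argmax(axis=1)    # index of the highest-probability topic per document
topic_strength = doc_topic_dist.max(axis=1)       # that topic's probability
doc_topics = pd.DataFrame({'dominant_topic': dominant_topic,
                           'topic_probability': topic_strength})
print(doc_topics.head())                          # first few documents and their dominant topics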