Always remember to import the libraries you need to work with your data. In this course we will work with four main libraries: scikit-learn, NumPy, SciPy, and pandas. An easy way to install all of them in one go is to get Anaconda: https://anaconda.org/anaconda/python. When you install Anaconda you also get Jupyter Notebook, in which you can run all the commands below. An alternative, which I highly recommend, is to install Anaconda first and then run your code from an editor such as TextWrangler, Sublime Text, Notepad++, or Vim. Doing the latter will be highly beneficial for you after this course is over (but it does have a learning curve).
pip3 install pandas ## This will install the pandas library. Run it in your terminal, not in Jupyter.
To install pip, follow the directions on this link: https://pip.pypa.io/en/stable/installing/
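If you prefer pip over Anaconda, you can install all four course libraries in one go. This is a minimal sketch using the standard PyPI package names; depending on your setup the command may be pip rather than pip3.
pip3 install pandas numpy scipy scikit-learn ## Installs all four libraries at once. Run in your terminal.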
import pandas as pd
import numpy as np
import scipy
print("Hello world!") ## This is something I typically write before anything to check if the environment is running.
df = pd.read_csv("/Users/boY/Desktop/code_me/text_mining/summer_2019/ManualSentimentClassifier/train.csv",encoding='ISO-8859-1') # There are different encodings, and for HW1 this particular one works well.
df.head(3) ## Peek at the first three rows.
df.columns ## To print out the column names
df.shape ## To get the dimensions of the data frame. The output is a tuple and should be read as (rows, columns).
rows, columns = df.shape # Tuple unpacking; print both values out to check the output yourself.
df.describe() ## Summary statistics for the columns.
df['class'].unique() ## To know the unique labels.
df['class'].value_counts() ## To know the distribution of the labels.
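If you would rather see the label distribution as proportions than raw counts, value_counts can normalize for you. This is just an optional sanity check:
df['class'].value_counts(normalize=True) ## Same distribution, expressed as fractions that sum to 1.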
from sklearn import preprocessing ### Importing a preprocessor to convert the labels in the target class.
train_class_y = ['negative','positive']
le = preprocessing.LabelEncoder() ## Label encoder does the trick.
le.fit(train_class_y) ## We are fitting the categories now.
train_y = le.transform(df['class']) ## Here we are transforming our labels to 0's and 1's. Basically binary values.
train_y ## These are our labels now. The output is an array of binary values (0s and 1s).
le.transform(['positive','negative','positive']) ### Just to check. "le" is the object we have created which transforms the data.
le.inverse_transform([0,1,1]) ### Doing an inverse of the transformation
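To see the whole encode/decode cycle in isolation, here is a tiny self-contained sketch on made-up labels (purely for illustration):
from sklearn import preprocessing
toy_le = preprocessing.LabelEncoder()
toy_le.fit(['negative','positive']) ## Classes are sorted alphabetically, so 'negative' -> 0 and 'positive' -> 1.
print(toy_le.transform(['positive','negative'])) ## [1 0]
print(toy_le.inverse_transform([1, 0])) ## ['positive' 'negative']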
train_x = df['text']
train_x.shape ## Just making sure that we have what we want.
from sklearn.feature_extraction.text import CountVectorizer ## CountVectorizer gives you the bag of words.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_x)
X_train_counts.shape
X_train_counts.toarray() ## Converts the sparse matrix to a dense array. Fine for small data, but memory-hungry for large corpora.
np.count_nonzero(X_train_counts.toarray()) ## Counting the non-zero entries; most entries are zero because the matrix is sparse.
count_vect.get_feature_names() ## Lists the vocabulary. (In scikit-learn 1.0+, this method is called get_feature_names_out().)
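To build intuition for what fit_transform just did, here is a minimal sketch on a made-up two-sentence corpus (the sentences are hypothetical):
from sklearn.feature_extraction.text import CountVectorizer
toy_corpus = ['the movie was good', 'the movie was bad']
toy_vect = CountVectorizer()
toy_counts = toy_vect.fit_transform(toy_corpus) ## A 2 x 5 sparse matrix of raw word counts.
print(toy_vect.get_feature_names()) ## ['bad', 'good', 'movie', 'the', 'was']
print(toy_counts.toarray()) ## Each row is one sentence; each column counts one vocabulary word.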
from sklearn.feature_extraction.text import TfidfVectorizer ## Importing the library that will help us do this.
tf = TfidfVectorizer(min_df=1, stop_words='english', max_features=5000) ## Ask yourself: why min_df=1? We are also removing English stop words.
## Try max_features=3000 as well and compare the results.
train_x_tfidf = tf.fit_transform(train_x)
tf.get_feature_names() ## Be careful to check your feature names with tf and not with train_x_tfidf
train_x_tfidf_array = train_x_tfidf.toarray()
train_x_tfidf_array[0]
tf.inverse_transform(train_x_tfidf_array[0].reshape(1, -1)) ## Just to check which features are present in the first document. Note the reshape: inverse_transform expects a 2-D (n_samples x n_features) array.
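If you want to see which terms carry the most weight in a single document, NumPy's argsort works on the dense row. This is an optional sketch, assuming the tf and train_x_tfidf_array objects defined above:
top_idx = np.argsort(train_x_tfidf_array[0])[::-1][:10] ## Indices of the 10 highest tf-idf scores in document 0.
features = tf.get_feature_names()
print([(features[i], train_x_tfidf_array[0][i]) for i in top_idx]) ## (term, weight) pairs, highest first.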
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=1.0) # Check what this alpha value is. You have already learnt most of the math to understand this.
mnb.fit(train_x_tfidf_array,train_y)
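A hint on that alpha question: it is the additive (Laplace) smoothing parameter. With smoothing, the estimated probability of word w in class c becomes (count(w, c) + alpha) / (total words in c + alpha * vocabulary size), so no word ever gets probability zero. A hand computation on made-up numbers:
count_w_c = 3 ## Hypothetical: word w appears 3 times in class c.
total_c = 100 ## Hypothetical: class c contains 100 word tokens in total.
vocab_size = 50 ## Hypothetical vocabulary size.
alpha = 1.0
print((count_w_c + alpha) / (total_c + alpha * vocab_size)) ## 4/150, about 0.027 -- never exactly zero.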
test_df = pd.read_csv("/Users/boY/Desktop/code_me/text_mining/summer_2019/ManualSentimentClassifier/test.csv",encoding='ISO-8859-1')
test_x_tfidf = tf.transform(test_df['text']) ## Where did we get the tf from?
test_x_tfidf_array = test_x_tfidf.toarray()
test_y = le.transform(test_df['class']) ## Where did we get "le" from?
test_y.shape
test_x_tfidf_array.shape
predictions = mnb.predict(test_x_tfidf_array)
predictions.shape
count = 0
for i in range(len(predictions)):
    if predictions[i]==test_y[i]:
        count = count + 1
count/len(predictions) ## Fraction of correct predictions, i.e. the accuracy. Using len(predictions) avoids hard-coding the test-set size (2000 here).
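scikit-learn ships this exact computation, so you can cross-check the loop above:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, predictions) ## Should match count/len(predictions) from the loop above.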
from sklearn.linear_model import LogisticRegression # load the library
log_reg = LogisticRegression(C=4.0) # C is the inverse of the regularization strength; larger C means less regularization.
log_reg.fit(train_x_tfidf_array,train_y)
log_reg.score(train_x_tfidf_array,train_y) # running it on the train set itself.
log_reg.score(test_x_tfidf_array,test_y) # running it on the test set.
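To see how much that C value matters, here is a small optional sweep (the specific C values below are arbitrary choices for illustration):
for c in [0.01, 0.1, 1.0, 4.0, 10.0]:
    model = LogisticRegression(C=c)
    model.fit(train_x_tfidf_array, train_y)
    print(c, model.score(test_x_tfidf_array, test_y)) ## Watch how test accuracy changes with regularization strength.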
from sklearn import svm
clf = svm.SVC(C=1.0,degree=1,kernel='linear') # degree only matters for the 'poly' kernel; it is ignored when kernel='linear'.
clf.fit(train_x_tfidf_array,train_y)
predicted = clf.predict(test_x_tfidf_array)
count = 0
for i in range(len(predicted)):
    if predicted[i]==test_y[i]:
        count = count + 1
count
count/len(predicted) ## Accuracy of the SVM on the test set.
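As with logistic regression, the score method computes this accuracy directly:
clf.score(test_x_tfidf_array, test_y) ## Same number as count/len(predicted) above.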
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(max_depth=10,n_estimators=100,min_samples_leaf=2)
forest.fit(train_x_tfidf_array,train_y)
forest.score(train_x_tfidf_array,train_y)
forest.score(test_x_tfidf_array,test_y)
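If you would rather search over those three hyperparameters than guess them, here is a minimal GridSearchCV sketch; the grid values below are arbitrary illustrations, and this can be slow on dense arrays:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': [5, 10, 20], 'n_estimators': [50, 100], 'min_samples_leaf': [1, 2]}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3) ## 3-fold cross-validation over all 12 combinations.
search.fit(train_x_tfidf_array, train_y)
print(search.best_params_, search.best_score_) ## Best combination and its cross-validated accuracy.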
forest_predictions = forest.predict(test_x_tfidf_array)
from sklearn.metrics import confusion_matrix
confusion_matrix(test_y, forest_predictions) ## We have done this in class.
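To read the matrix: with our encoding, row/column 0 is 'negative' and row/column 1 is 'positive'; rows are true classes and columns are predicted classes, so the diagonal holds the correct predictions. classification_report summarizes the same predictions as precision, recall, and F1:
from sklearn.metrics import classification_report
print(classification_report(test_y, forest_predictions, target_names=le.classes_)) ## Per-class precision, recall, and F1.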