In [1]:
import pandas as pd
import math
In [2]:
wikipedia_example = {'Class' : ['C','C','C','C','NC','NC','NC'], 'Mut1' : [1,1,1,0,0,0,1], 'Mut2' : [1,1,0,1,0,1,1], 'Mut3':[1,0,1,1,0,0,0], 'Mut4' : [0,1,1,0,0,0,0]}
dataset = pd.DataFrame(data=wikipedia_example)
A quick sanity check on math.log2: log2(1) is 0, so a class with probability 1 contributes no entropy.

In [3]:
print(math.log2(1))
0.0
In [4]:
dataset
Out[4]:
Class Mut1 Mut2 Mut3 Mut4
0 C 1 1 1 0
1 C 1 1 0 1
2 C 1 0 1 1
3 C 0 1 1 0
4 NC 0 0 0 0
5 NC 0 1 0 0
6 NC 1 1 0 0

To count the number of rows labelled 'C':

In [5]:
(dataset['Class'] == 'C').values.sum()
Out[5]:
4

And to count the number of rows labelled 'NC':

In [6]:
(dataset['Class'] == 'NC').values.sum()
Out[6]:
3
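
An equivalent one-liner uses pandas' value_counts, which tallies every class at once (shown here only as an alternative; the calculation below sticks with the boolean-sum approach):

dataset['Class'].value_counts()
# C: 4, NC: 3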

The total number of rows in the dataset:

In [7]:
len(dataset.index)
Out[7]:
7
In [8]:
def calc_entropy(sample):
    # Assumes sample is a pandas DataFrame with a 'Class' column
    prob_dict = {}
    # Unique classes appearing in the 'Class' column
    classes = sample['Class'].unique()
    total_samples = len(sample.index)
    for i in classes:
        # Fraction of rows belonging to class i
        count = (sample['Class'] == i).values.sum()
        prob_dict[i] = count / total_samples
    total_entropy = 0.0
    for i in classes:
        # Shannon entropy: -sum_i p_i * log2(p_i)
        total_entropy += -prob_dict[i] * math.log2(prob_dict[i])
    return total_entropy
In [9]:
calc_entropy (dataset)
Out[9]:
0.9852281360342515
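
The same number drops straight out of the entropy formula for 4 'C' and 3 'NC' rows out of 7; written out by hand as a small check:

p_c, p_nc = 4/7, 3/7
-p_c * math.log2(p_c) - p_nc * math.log2(p_nc)   # ≈ 0.9852, matching calc_entropy(dataset)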
In [10]:
def information_gain(sample, feature):
    # Assumes sample is a pandas DataFrame with a 'Class' column
    # Assumes feature is a column appearing in sample
    feature_values = sample[feature].unique()
    sample_size = len(sample.index)
    weighted_relative = 0.0
    for i in feature_values:
        print("\n\n")
        print("Calculating for ", i)
        # Subset of rows where the feature takes the value i
        new_sample = sample[sample[feature] == i].copy()
        new_sample_size = len(new_sample.index)
        print("new_sample_size = ", new_sample_size)
        print(new_sample)
        # Entropy of the subset, weighted by its share of the full sample
        relative_entropy = calc_entropy(new_sample)
        print("relative_entropy = ", relative_entropy)
        weighted_relative += (new_sample_size / sample_size) * relative_entropy
    # Information gain = parent entropy minus the weighted subset entropies
    return calc_entropy(sample) - weighted_relative

In [11]:
information_gain(dataset , 'Mut1')


Calculating for  1
new_sample_size =  4
  Class  Mut1  Mut2  Mut3  Mut4
0     C     1     1     1     0
1     C     1     1     0     1
2     C     1     0     1     1
6    NC     1     1     0     0
relative_entropy =  0.8112781244591328



Calculating for  0
new_sample_size =  3
  Class  Mut1  Mut2  Mut3  Mut4
3     C     0     1     1     0
4    NC     0     0     0     0
5    NC     0     1     0     0
relative_entropy =  0.9182958340544896
Out[11]:
0.12808527889139443
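
The returned value agrees with a hand calculation from the two subset entropies printed above: the parent entropy minus the size-weighted entropies of the Mut1 == 1 and Mut1 == 0 subsets.

calc_entropy(dataset) - (4/7) * 0.8112781244591328 - (3/7) * 0.9182958340544896
# ≈ 0.1281, matching Out[11]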
In [12]:
information_gain(dataset, 'Mut2')


Calculating for  1
new_sample_size =  5
  Class  Mut1  Mut2  Mut3  Mut4
0     C     1     1     1     0
1     C     1     1     0     1
3     C     0     1     1     0
5    NC     0     1     0     0
6    NC     1     1     0     0
relative_entropy =  0.9709505944546686



Calculating for  0
new_sample_size =  2
  Class  Mut1  Mut2  Mut3  Mut4
2     C     1     0     1     1
4    NC     0     0     0     0
relative_entropy =  1.0
Out[12]:
0.0059777114237740125
In [13]:
information_gain(dataset, 'Mut3')


Calculating for  1
new_sample_size =  3
  Class  Mut1  Mut2  Mut3  Mut4
0     C     1     1     1     0
2     C     1     0     1     1
3     C     0     1     1     0
relative_entropy =  0.0



Calculating for  0
new_sample_size =  4
  Class  Mut1  Mut2  Mut3  Mut4
1     C     1     1     0     1
4    NC     0     0     0     0
5    NC     0     1     0     0
6    NC     1     1     0     0
relative_entropy =  0.8112781244591328
Out[13]:
0.5216406363433185
In [14]:
information_gain(dataset, 'Mut4')


Calculating for  0
new_sample_size =  5
  Class  Mut1  Mut2  Mut3  Mut4
0     C     1     1     1     0
3     C     0     1     1     0
4    NC     0     0     0     0
5    NC     0     1     0     0
6    NC     1     1     0     0
relative_entropy =  0.9709505944546686



Calculating for  1
new_sample_size =  2
  Class  Mut1  Mut2  Mut3  Mut4
1     C     1     1     0     1
2     C     1     0     1     1
relative_entropy =  0.0
Out[14]:
0.2916919971380597
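
Mut3 gives by far the largest information gain (≈ 0.52), so an ID3-style tree would split on it first. The comparison can also be automated by looping over the feature columns; a short sketch reusing the functions above (the debug prints inside information_gain will still fire):

gains = {f: information_gain(dataset, f) for f in ['Mut1', 'Mut2', 'Mut3', 'Mut4']}
max(gains, key=gains.get)   # expected: 'Mut3'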

Separate the feature columns from the class labels for scikit-learn:

In [15]:
feature_cols = ['Mut1' , 'Mut2' , 'Mut3' , 'Mut4'] 
In [16]:
X = dataset.loc[:,feature_cols]
In [17]:
X
Out[17]:
Mut1 Mut2 Mut3 Mut4
0 1 1 1 0
1 1 1 0 1
2 1 0 1 1
3 0 1 1 0
4 0 0 0 0
5 0 1 0 0
6 1 1 0 0
In [18]:
y = dataset.Class
In [19]:
y
Out[19]:
0     C
1     C
2     C
3     C
4    NC
5    NC
6    NC
Name: Class, dtype: object

Now repeat the exercise with scikit-learn's DecisionTreeClassifier, using entropy as the split criterion so its splits correspond to information gain.

In [20]:
from sklearn import tree
In [21]:
clf = tree.DecisionTreeClassifier(criterion="entropy")
In [22]:
clf = clf.fit(X,y)
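
As a quick check on the fitted model, scikit-learn exposes feature_importances_; for this tree, only Mut3 and Mut4 should carry any importance, since they are the only features used in splits (a sketch, assuming the clf fitted above):

dict(zip(feature_cols, clf.feature_importances_))
# expect non-zero values only for Mut3 and Mut4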
In [23]:
tree.plot_tree(clf, feature_names = feature_cols , class_names = ['C','NC'])
Out[23]:
[Text(0.6, 0.8333333333333334, 'Mut3 <= 0.5\nentropy = 0.985\nsamples = 7\nvalue = [4, 3]\nclass = C'),
 Text(0.4, 0.5, 'Mut4 <= 0.5\nentropy = 0.811\nsamples = 4\nvalue = [1, 3]\nclass = NC'),
 Text(0.2, 0.16666666666666666, 'entropy = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = NC'),
 Text(0.6, 0.16666666666666666, 'entropy = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = C'),
 Text(0.8, 0.5, 'entropy = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = C')]
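
The root node splits on Mut3, matching the feature with the highest information gain computed earlier. The fitted classifier can also be used for prediction; a minimal sketch with a made-up mutation profile (the row values below are purely illustrative):

new_row = pd.DataFrame([[1, 0, 1, 0]], columns=feature_cols)
clf.predict(new_row)   # expected: ['C'], since Mut3 == 1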
In [24]:
!pip install graphviz
Requirement already satisfied: graphviz in ./env/lib/python3.8/site-packages (0.19.1)
In [25]:
import graphviz
dot_data = tree.export_graphviz(clf, out_file=None , feature_names = feature_cols , class_names = ['C','NC'])
graph = graphviz.Source(dot_data)
In [26]:
graph.render("Hello")
Out[26]:
'Hello.pdf'
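
A graphviz.Source object also displays inline when it is the last expression in a notebook cell, and render accepts a format argument if a PNG is preferred over the default PDF (a sketch):

graph                                          # renders the tree inline in Jupyter
graph.render("decision_tree", format="png")    # should write 'decision_tree.png'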
In [ ]: