 ##### Untitled

Guest 34 4th Dec, 2019

PYTHON 4.24 KB
```                                           ```
"""
This file contains code for lab 5a.

Simply call the app to run it.

'python main.py'

Kevin Tran 000375580, Mohawk College, 2019

Kevin Tran 000375580

----------------------------------------

Using the 'gene expression cancer RNA-Seq' data set,

Using 100% of the data, after the first task, an elbow was observed at k=4 and k=5,
but k=5 seems to be the more accurate choice.
This was conclusive and correlates with the dataset classifications.

(using 100% of the data)

Found very pure clusters, only a few outliers in each cluster.

cluster: 0
Number of KIRC: 
Number of BRCA: 
cluster: 1
Number of KIRC: 
Number of BRCA: 
cluster: 2
Number of KIRC: 
Number of BRCA: 
cluster: 3
Number of KIRC: 
Number of BRCA: 
cluster: 4
Number of KIRC: 
Number of BRCA: 

inertia_
14783575.690876452

"""

import numpy as np
import csv
from sklearn.cluster import KMeans
import random
import argparse
import sys
import csv
import matplotlib.pyplot as plt

def main():
    """Run the lab 5a clustering workflow.

    Loads the samples from ``data.csv`` and their class labels from
    ``labels.csv``, prints the k-means inertia for every k from 2 to 10, then
    fits a final 5-cluster model and prints, for each cluster, how many of its
    samples are labelled KIRC and BRCA, followed by the final inertia.
    """
    print("Welcome to Lab 5a\n")
    print("Kevin Tran 000375580\n")
    print("----------------------------------------\n")

    df = "data.csv"
    lf = "labels.csv"

    # Read Data and Label csv
    # NOTE(review): the two loader calls were absent from the source as
    # received, which left `data` and `labels` undefined below. The loader
    # names used here are reconstructed -- confirm against the original lab.
    data = read_data(df)
    labels = read_labels(lf)

    k_values = []
    inertias = []

    runs = 3
    kmax = 11

    # Run Kmeans for Kvalues 2-10
    for k in range(2, kmax):
        kmeans = KMeans(n_clusters=k, n_init=runs).fit(data)
        print('k: ' + str(k) + ' inertia: ' + str(kmeans.inertia_))
        inertias.append(kmeans.inertia_)
        k_values.append(k)

    # Uncomment to plot inertia against k and look for the elbow.
    #graph(k_values, inertias)

    # k = 5 is the most obvious choice here.
    k = 5
    kmeans = KMeans(n_clusters=k).fit(data)

    # For each cluster, count the occurrence of each label.
    for x in range(len(kmeans.cluster_centers_)):
        # Label rows of the samples assigned to cluster x.
        members = labels[kmeans.labels_ == x]
        print("cluster: " + str(x))
        # count_nonzero yields a plain integer whatever the number of label
        # columns; builtin sum() over a 2-D boolean array returns an array.
        print("Number of KIRC: " + str(np.count_nonzero(members == 'KIRC')))
        print("Number of BRCA: " + str(np.count_nonzero(members == 'BRCA')))

    print('inertia_')
    print(kmeans.inertia_)


def graph(x, y):
    """Plot y against x as a line chart and display it.

    The only call site in this file (currently commented out in main) passes
    the k values as x and the matching inertia values as y, so the axes are
    labelled accordingly.
    """
    plt.plot(x, y)
    plt.xlabel('k values')
    plt.ylabel('inertia')
    plt.title('Lab 5 - Bluster and muster some clusters')
    plt.show()


def read_data(file):
    """Read the data file, given its filename, into a 2-D float array.

    NOTE(review): the ``def`` line and the csv reader loop were missing from
    the source as received; the function name and the row iteration are
    reconstructed -- confirm against the original.

    Every cell is parsed with ``float``, so the file is assumed to contain
    only numeric values (no header row or id column) -- TODO confirm against
    the actual CSV. A non-numeric cell raises ``ValueError``.
    """
    raw = []

    # newline="" is what the csv module documents for files passed to reader.
    with open(file, newline="") as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                # Blank lines would make the array ragged; skip them.
                continue
            # Keep the converted values (the original appended the raw
            # string row and threw the float conversion away).
            raw.append([float(item) for item in row])

    return np.array(raw, dtype=float)


def read_labels(file):
    """Read the labels file, given its filename, into an array of strings.

    NOTE(review): the ``def`` line and the csv reader loop were missing from
    the source as received; the function name and the row iteration are
    reconstructed -- confirm against the original.

    Each non-blank CSV row becomes one row of the returned array, so a
    single-column file yields shape ``(n_rows, 1)``.
    """
    labels = []

    with open(file, newline="") as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                # Blank lines carry no label; skip them.
                continue
            labels.append(list(row))

    return np.array(labels)

# Call the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
```
```