avatar
Untitled

Guest 34 4th Dec, 2019

PYTHON 4.24 KB
                                           
                         """
This file contains code for lab 5a. 

Simply call the app to run it.

'python main.py'

Kevin Tran 000375580, Mohawk College, 2019

Kevin Tran 000375580

----------------------------------------

Using the 'gene expression cancer RNA-Seq' data set.

Using 100% of the data, after the first task, an elbow was observed at k=4 and k=5,
but k=5 seems to be the more accurate one.
This was conclusive and correlates with the dataset classifications.

Second Task
(using 100% of the data)

Found very pure clusters, only a few outliers in each cluster. 


cluster: 0
Number of COAD: [0]
Number of LUAD: [2]
Number of PRAD: [0]
Number of KIRC: [1]
Number of BRCA: [300]
cluster: 1
Number of COAD: [0]
Number of LUAD: [0]
Number of PRAD: [0]
Number of KIRC: [145]
Number of BRCA: [0]
cluster: 2
Number of COAD: [1]
Number of LUAD: [139]
Number of PRAD: [0]
Number of KIRC: [0]
Number of BRCA: [0]
cluster: 3
Number of COAD: [0]
Number of LUAD: [0]
Number of PRAD: [136]
Number of KIRC: [0]
Number of BRCA: [0]
cluster: 4
Number of COAD: [77]
Number of LUAD: [0]
Number of PRAD: [0]
Number of KIRC: [0]
Number of BRCA: [0]

inertia_
14783575.690876452

"""

import numpy as np
import csv
from sklearn.cluster import KMeans
import random
import argparse
import sys
import csv
import matplotlib.pyplot as plt

def main():
    """Run both lab tasks: an inertia sweep over k, then a k=5 cluster breakdown.

    Reads "data.csv" and "labels.csv" from the current directory, prints the
    KMeans inertia for k = 2..10, then fits k = 5 and prints, for every
    cluster, how many samples carry each of the five class labels.
    """
    for banner in (
        "Welcome to Lab 5a\n",
        "Kevin Tran 000375580\n",
        "----------------------------------------\n",
    ):
        print(banner)

    # Load the feature matrix first, then the class labels
    data = readDataFile("data.csv")
    labels = readLabelFile("labels.csv")

    # Collected for the (currently disabled) elbow plot below
    inertias = []
    k_values = []

    n_runs = 3
    k_stop = 11

    # Task 1: fit KMeans for k = 2..10 and report the inertia of each fit
    for k in range(2, k_stop):
        model = KMeans(n_clusters=k, n_init=n_runs).fit(data)
        print(f"k: {k!s} inertia: {model.inertia_!s}")
        inertias.append(model.inertia_)
        k_values.append(k)

    #graph(k_values, inertias)

    # k = 5 is the most obvious choice here.

    # Task 2
    k = 5
    kmeans = KMeans(n_clusters=k).fit(data)

    # For each cluster, count the occurrences of each label
    for cluster in range(len(kmeans.cluster_centers_)):
        print(f"cluster: {cluster!s}")
        # Label rows of the samples assigned to this cluster
        members = labels[kmeans.labels_ == cluster]
        for name in ('COAD', 'LUAD', 'PRAD', 'KIRC', 'BRCA'):
            print("Number of " + name + ": " + str(sum(members == [name])))

    print('inertia_')
    print(kmeans.inertia_)
    
def graph(x, y):
    """Plot y against x as a line chart and display it.

    The axis labels assume the usage shown by the (commented-out) call in
    main(): x holds the k values and y holds the matching inertia values.

    Args:
        x: 1D sequence of values for the horizontal axis (k values).
        y: 1D sequence of values for the vertical axis (inertia), one per x.
    """
    plt.plot(x, y)
    # Label each axis with the quantity it actually carries; the descriptive
    # lab name belongs in the title rather than on the x axis.
    plt.xlabel('k values')
    plt.ylabel('inertia')
    plt.title('Lab 5 - Bluster and muster some clusters')
    plt.show()

def readDataFile(file):
    """Read an all-numeric, comma-separated file into a numpy float array.

    Args:
        file: Path of the CSV file to read. Every cell must be parseable by
            float(); there is no header handling.

    Returns:
        A numpy array of floats with one row per non-empty CSV line.

    Raises:
        ValueError: If a cell cannot be converted to float.
    """
    # newline='' lets the csv module handle line endings itself
    with open(file, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        # Store the converted floats, not the raw string cells, and skip
        # blank lines (csv.reader yields [] for them).
        rows = [[float(item) for item in row] for row in csv_reader if row]

    return np.array(rows, dtype=float)

def readLabelFile(file):
    """Load the labels CSV and return its rows as a numpy array.

    Each CSV row is kept unchanged as a list of strings, so a file with one
    label per line produces an array with one single-element row per line.

    Args:
        file: Path of the comma-separated labels file.

    Returns:
        A numpy array built from the list of CSV rows.
    """
    with open(file) as handle:
        rows = list(csv.reader(handle, delimiter=','))

    return np.array(rows)

# Run the lab only when this file is executed as a script (not on import)
if __name__ == "__main__":
    main()
                      
                                       
To share this paste please copy this url and send to your friends
RAW Paste Data
Recent Pastes