This project involves building a classification model to predict the functional family of genes based on DNA sequences. The model is trained on human DNA and tested on DNA sequences from chimpanzees and dogs, with various encoding methods. The model performs well on human and chimpanzee DNA, but struggles with dog DNA due to genetic differences. This project is inspired by Kaggle and was mainly used to highlight the importance of genetic diversity in model performance and showcase the application of Machine Learning in DNA sequence analysis.
from Bio import SeqIO import matplotlib.pyplot as plt import pandas as pd import numpy as np import os import re from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import OneHotEncoder from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer from sklearn.model_selection import train_test_split from sklearn.naive_bayes import MultinomialNB from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# an example DNA sequence for sequence in SeqIO.parse('data/dna-sequence-dataset/example_dna.fa', "fasta"): print("Sequence ID:", sequence.id) print("Sequence length:", len(sequence)) print("-> [ {} ]".format(sequence.seq)) break
Sequence ID: ENST00000435737.5 Sequence length: 390 -> [ ATGTTTCGCATCACCAACATTGAGTTTCTTCCCGAATACCGACAAAAGGAGTCCAGGGAATTTCTTTCAGTGTCACGGACTGTGCAGCAAGTGATAAACCTGGTTTATACAACATCTGCCTTCTCCAAATTTTATGAGCAGTCTGTTGTTGCAGATGTCAGCAACAACAAAGGCGGCCTCCTTGTCCACTTTTGGATTGTTTTTGTCATGCCACGTGCCAAAGGCCACATCTTCTGTGAAGACTGTGTTGCCGCCATCTTGAAGGACTCCATCCAGACAAGCATCATAAACCGGACCTCTGTGGGGAGCTTGCAGGGACTGGCTGTGGACATGGACTCTGTGGTACTAAATGAAGTCCTGGGGCTGACTCTCATTGTCTGGATTGACTGA ]
def string_to_array(sequence): sequence = sequence.lower() sequence = re.sub('[^acgt]', '', sequence) sequence = np.array(list(sequence)) return sequence # create a label encoder with the alphabet {a,c,g,t} label_encoder = LabelEncoder() label_encoder.fit(np.array(['a', 'c', 'g', 't']))
def ordinal_encode(array): integer_encoded = label_encoder.transform(array) float_encoded = integer_encoded.astype(float) float_encoded[float_encoded == 0] = 0.25 # A float_encoded[float_encoded == 1] = 0.50 # C float_encoded[float_encoded == 2] = 0.75 # G float_encoded[float_encoded == 3] = 1.00 # T float_encoded[float_encoded == 4] = 0.00 # any other letter return float_encoded # test the program on a short sequence: test_sequence = 'TTCAGCCAGTG' ordinal_encode(string_to_array(test_sequence))
array([1. , 1. , 0.5 , 0.25, 0.75, 0.5 , 0.5 , 0.25, 0.75, 1. , 0.75])
def one_hot_encode(string): integer_encoded = label_encoder.transform(string) onehot_encoder = OneHotEncoder(sparse=False, dtype=int) integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) onehot_encoded = onehot_encoder.fit_transform(integer_encoded) onehot_encoded = np.delete(onehot_encoded, -1, 1) return onehot_encoded # test the program on a short sequence: test_sequence = 'GAATTCTCGAA' one_hot_encode(string_to_array(test_sequence))
array([[0, 0, 1], [1, 0, 0], [1, 0, 0], [0, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0], [1, 0, 0]])
def kmers(sequence, k): return [sequence[index:index+k].upper() for index in range(len(sequence) - k + 1)] # test the program on a short sequence: test_sequence = 'ATGCATGCA' kmers(test_sequence, 5)
['ATGCA', 'TGCAT', 'GCATG', 'CATGC', 'ATGCA']
words = kmers(test_sequence, 5) sentence = ' '.join(words) sentence
'ATGCA TGCAT GCATG CATGC ATGCA'
sequence1 = 'TCTCACACATGTGCCAATCACTGTCACCC' sequence2 = 'GTGCCCAGGTTCAGTGAGTGACACAGGCAG' phrase1 = ' '.join(kmers(sequence1, 6)) phrase2 = ' '.join(kmers(sequence2, 6))
# Next, we build the bag-of-words model cv = CountVectorizer() X = cv.fit_transform([phrase, phrase1, phrase2]).toarray() X
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0], [0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1]], dtype=int64)
human_dna = pd.read_table('data/dna-sequence-dataset/human.txt') human_dna.head()
-------------------- sequence ------------------- class 0 ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA... 4 1 ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG... 4 2 ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG... 3 3 ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG... 3 4 ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG... 3
human_dna['class'].value_counts().sort_index().plot.bar() plt.title("Distribution of classes in human DNA")
chimp_dna = pd.read_table('data/dna-sequence-dataset/chimpanzee.txt') chimp_dna.head()
-------------------- sequence ------------------- class 0 ATGCCCCAACTAAATACCGCCGTATGACCCACCATAATTACCCCCA... 4 1 ATGAACGAAAATCTATTCGCTTCATTCGCTGCCCCCACAATCCTAG... 4 2 ATGGCCTCGCGCTGGTGGCGGTGGCGACGCGGCTGCTCCTGGAGGC... 4 3 ATGGCCTCGCGCTGGTGGCGGTGGCGACGCGGCTGCTCCTGGAGGC... 4 4 ATGGGCAGCGCCAGCCCGGGTCTGAGCAGCGTGTCCCCCAGCCACC... 6
chimp_dna['class'].value_counts().sort_index().plot.bar() plt.title("Distribution of classes in chimpanzee DNA")
dog_dna = pd.read_table('data/dna-sequence-dataset/dog.txt') dog_dna.head()
-------------------- sequence ------------------- class 0 ATGCCACAGCTAGATACATCCACCTGATTTATTATAATCTTTTCAA... 4 1 ATGAACGAAAATCTATTCGCTTCTTTCGCTGCCCCCTCAATAATAG... 4 2 ATGGAAACACCCTTCTACGGCGATGAGGCGCTGAGCGGCCTGGGCG... 6 3 ATGTGCACTAAAATGGAACAGCCCTTCTACCACGACGACTCATACG... 6 4 ATGAGCCGGCAGCTAAACAGAAGCCAGAACTGCTCCTTCAGTGACG... 0
dog_dna['class'].value_counts().sort_index().plot.bar() plt.title("Distribution of classes in dog DNA")
def kmers(seq, size=6): return [seq[x:x+size].upper() for x in range(len(seq) - size + 1)] # our training data sequences are converted into hexamers # the same process is then applied to different species human_dna['words'] = human_dna.apply(lambda x: kmers(x['sequence']), axis=1) human_dna = human_dna.drop('sequence', axis=1) chimp_dna['words'] = chimp_dna.apply(lambda x: kmers(x['sequence']), axis=1) chimp_dna = chimp_dna.drop('sequence', axis=1) dog_dna['words'] = dog_dna.apply(lambda x: kmers(x['sequence']), axis=1) dog_dna = dog_dna.drop('sequence', axis=1)
human_dna.head()
class words 0 4 [ATGCCC, TGCCCC, GCCCCA, CCCCAA, CCCAAC, CCAAC... 1 4 [ATGAAC, TGAACG, GAACGA, AACGAA, ACGAAA, CGAAA... 2 3 [ATGTGT, TGTGTG, GTGTGG, TGTGGC, GTGGCA, TGGCA... 3 3 [ATGTGT, TGTGTG, GTGTGG, TGTGGC, GTGGCA, TGGCA... 4 3 [ATGCAA, TGCAAC, GCAACA, CAACAG, AACAGC, ACAGC...
human_text = list(human_dna['words']) for i in range(len(human_text)): human_text[i] = ' '.join(human_text[i]) chimp_text = list(chimp_dna['words']) for i in range(len(chimp_text)): chimp_text[i] = ' '.join(chimp_text[i]) dog_text = list(dog_dna['words']) for i in range(len(dog_text)): dog_text[i] = ' '.join(dog_text[i]) # the labels are separated y_human = human_dna.iloc[:, 0].values y_chimp = chimp_dna.iloc[:, 0].values y_dog = dog_dna.iloc[:, 0].values
y_human
cv = CountVectorizer(ngram_range=(4,4)) X_human = cv.fit_transform(human_text) X_chimp = cv.transform(chimp_text) X_dog = cv.transform(dog_text)
print("For humans, we have {} genes converted into uniform-length vectors for counting 4-gram hexamers.".format(X_human.shape[0])) print("For chimpanzees, we have {} genes converted into uniform-length vectors for counting 4-gram hexamers.".format(X_chimp.shape[0])) print("For dogs, we have {} genes converted into uniform-length vectors for counting 4-gram hexamers.".format(X_dog.shape[0]))
For humans, we have 4380 genes converted into uniform-length vectors for counting 4-gram hexamers. For chimpanzees, we have 1682 genes converted into uniform-length vectors for counting 4-gram hexamers. For dogs, we have 820 genes converted into uniform-length vectors for counting 4-gram hexamers.
# Split the human DNA data into training and test subsets X_train, X_test, y_train, y_test = train_test_split(X_human, y_human, test_size=0.20, random_state=42)
classifier = MultinomialNB(alpha=0.1) classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("Confusion Matrix for predictions on human DNA sequences:\n") print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_pred, name='Predicted'))) def metrics(y_test, y_predicted): accuracy = accuracy_score(y_test, y_predicted) precision = precision_score(y_test, y_predicted, average='weighted') recall = recall_score(y_test, y_predicted, average='weighted') f1 = f1_score(y_test, y_predicted, average='weighted') return accuracy, precision, recall, f1 accuracy, precision, recall, f1 = metrics(y_test, y_pred) print("\nAccuracy = {} \nPrecision = {} \nRecall = {} \nF1 Score = {}".format(accuracy, precision, recall, f1))
Confusion Matrix for predictions on human DNA sequences: Predicted 0 1 2 3 4 5 6 Actual 0 99 0 0 0 1 0 2 1 0 104 0 0 0 0 2 2 0 0 78 0 0 0 0 3 0 0 0 124 0 0 1 4 1 0 0 0 143 0 5 5 0 0 0 0 0 51 0 6 1 0 0 1 0 0 263 Accuracy = 0.9840182648401826 Precision = 0.984290543482443 Recall = 0.9840182648401826 F1 Score = 0.9840270014702487
y_pred_chimp = classifier.predict(X_chimp)
print("Confusion Matrix for predictions on chimpanzee DNA sequences:\n") print(pd.crosstab(pd.Series(y_chimp, name='Actual'), pd.Series(y_pred_chimp, name='Predicted'))) accuracy, precision, recall, f1 = metrics(y_chimp, y_pred_chimp) print("\nAccuracy = {} \nPrecision = {} \nRecall = {} \nF1 Score = {}".format(accuracy, precision, recall, f1))
Confusion Matrix for predictions on chimpanzee DNA sequences: Predicted 0 1 2 3 4 5 6 Actual 0 232 0 0 0 0 0 2 1 0 184 0 0 0 0 1 2 0 0 144 0 0 0 0 3 0 0 0 227 0 0 1 4 2 0 0 0 254 0 5 5 0 0 0 0 0 109 0 6 0 0 0 0 0 0 521 Accuracy = 0.9934601664684899 Precision = 0.993551028649631 Recall = 0.9934601664684899 F1 Score = 0.993453334125236
y_pred_dog = classifier.predict(X_dog)
print("Confusion Matrix for predictions on dog DNA sequences:\n") print(pd.crosstab(pd.Series(y_dog, name='Actual'), pd.Series(y_pred_dog, name='Predicted'))) accuracy, precision, recall, f1 = metrics(y_dog, y_pred_dog) print("\nAccuracy = {} \nPrecision = {} \nRecall = {} \nF1 Score = {}".format(accuracy, precision, recall, f1))
Confusion Matrix for predictions on dog DNA sequences: Predicted 0 1 2 3 4 5 6 Actual 0 127 0 0 0 0 0 4 1 0 63 0 0 1 0 11 2 0 0 49 0 1 0 14 3 1 0 0 81 2 0 11 4 4 0 0 1 126 0 4 5 4 0 0 0 1 53 2 6 0 0 0 0 0 0 260 Accuracy = 0.925609756097561 Precision = 0.9340667154775472 Recall = 0.925609756097561 F1 Score = 0.9251228556957126