Spaces:
Sleeping
Sleeping
File size: 2,265 Bytes
404d784 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | import pandas as pd
from Bio.Seq import Seq
import os
def is_valid_sequence(dna_seq: str) -> bool:
    """
    Check whether a DNA string has the shape of a complete coding sequence.

    The comparison is case-insensitive. A sequence is accepted only when all
    of the following hold:

    * it contains only the characters A, T, G and C;
    * its length is a multiple of three;
    * it begins with one of the start codons ATG, TTG, CTG or GTG;
    * it ends with one of the stop codons TAA, TAG or TGA;
    * no stop codon occurs in frame before the final codon.

    Args:
        dna_seq (str): The DNA sequence to validate.

    Returns:
        bool: True if the sequence passes every check, False otherwise.
        Non-string input (for example ``None``) is reported as invalid
        instead of raising.
    """
    start_codons = ('ATG', 'TTG', 'CTG', 'GTG')
    stop_codons = ('TAA', 'TAG', 'TGA')

    if not isinstance(dna_seq, str):
        return False

    # Normalise case once instead of on every individual check.
    seq = dna_seq.upper()

    if not set(seq) <= set('ATGC'):
        return False
    if len(seq) % 3 != 0:
        return False
    if not seq.startswith(start_codons):
        return False
    if not seq.endswith(stop_codons):
        return False

    # Walk the reading frame but stop before the last codon: that one is the
    # terminal stop already checked above, so only premature stops are rejected.
    for pos in range(0, len(seq) - 3, 3):
        if seq[pos:pos + 3] in stop_codons:
            return False
    return True
def main():
    """
    Main function to process and validate E. coli gene data.

    Steps performed:

    1. Ensure the ``data`` directory exists.
    2. Read ``data/CAI.csv`` (six columns, renamed to ``gene_id``,
       ``cai_score``, ``drop1``, ``drop2``, ``dna_sequence``, ``drop3``) and
       ``data/Database 3_4300 gene.csv`` (one column, ``dna_sequence``).
    3. Keep every row of the first file whose DNA passes
       ``is_valid_sequence``; translate it with ``Bio.Seq`` and flag whether
       the exact DNA string also appears in the second file.
    4. Write the result to ``data/ecoli_processed_genes.csv`` and print a
       short summary.
    """
    # Pinning the columns guarantees the CSV still gets a header row when no
    # gene passes validation.
    output_columns = [
        'gene_id',
        'dna_sequence',
        'protein_sequence',
        'cai_score',
        'is_high_cai',
    ]

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs('data', exist_ok=True)

    print("Loading data from CSV files...")
    df_all = pd.read_csv(
        "data/CAI.csv",
        header=0,
        names=['gene_id', 'cai_score', 'drop1', 'drop2', 'dna_sequence', 'drop3'],
    )
    df_high_cai = pd.read_csv(
        "data/Database 3_4300 gene.csv",
        header=0,
        names=['dna_sequence'],
    )
    # Set membership keeps the per-gene lookup O(1).
    high_cai_sequences = set(df_high_cai['dna_sequence'])

    validated_genes = []
    # Iterate only the three columns that are used; iterrows() would build a
    # full Series for every row.
    for gene_id, cai_score, raw_sequence in zip(
        df_all['gene_id'], df_all['cai_score'], df_all['dna_sequence']
    ):
        # str() turns missing values (NaN) into text that fails validation.
        dna_sequence = str(raw_sequence)
        if not is_valid_sequence(dna_sequence):
            continue

        # NOTE(review): translate() is called with Biopython's defaults, so
        # presumably the standard table is used, alternative start codons are
        # not forced to M and the terminal stop is kept in the output --
        # confirm this is what downstream consumers expect.
        protein_sequence = str(Seq(dna_sequence).translate())
        validated_genes.append({
            'gene_id': gene_id,
            'dna_sequence': dna_sequence,
            'protein_sequence': protein_sequence,
            'cai_score': cai_score,
            'is_high_cai': dna_sequence in high_cai_sequences,
        })

    df_processed = pd.DataFrame(validated_genes, columns=output_columns)
    output_path = 'data/ecoli_processed_genes.csv'
    df_processed.to_csv(output_path, index=False)
    print(f"Processed data saved to {output_path}")
    print(f"Total validated genes: {len(df_processed)}")
# Run the pipeline only when this file is executed as a script, so importing
# the module (e.g. to reuse is_valid_sequence) has no side effects.
if __name__ == "__main__":
    main()
|