Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from Bio.Seq import Seq | |
| import os | |
_START_CODONS = ('ATG', 'TTG', 'CTG', 'GTG')
_STOP_CODONS = ('TAA', 'TAG', 'TGA')
_NUCLEOTIDES = frozenset('ATGC')


def is_valid_sequence(dna_seq: str) -> bool:
    """
    Check whether a DNA string looks like a complete coding sequence.

    The check is case-insensitive. A sequence passes only if all of the
    following hold:

    * its length is a multiple of three;
    * it begins with one of the start codons ATG, TTG, CTG or GTG;
    * it ends with one of the stop codons TAA, TAG or TGA;
    * no in-frame codon before the final one is a stop codon;
    * it contains only the letters A, T, G and C.

    Args:
        dna_seq (str): The DNA sequence to validate.

    Returns:
        bool: True if the sequence is valid, False otherwise. Non-string
        input (for example ``None`` or a float NaN) is reported as invalid
        instead of raising.
    """
    # Guard against non-string values so callers get a plain False rather
    # than a TypeError/AttributeError from len() or .upper().
    if not isinstance(dna_seq, str):
        return False

    # Normalise case once instead of re-uppercasing for every check.
    seq = dna_seq.upper()

    if len(seq) % 3 != 0:
        return False
    if not seq.startswith(_START_CODONS):
        return False
    if not seq.endswith(_STOP_CODONS):
        return False
    if not _NUCLEOTIDES.issuperset(seq):
        return False

    # Scan every in-frame codon except the terminal one: the range stops
    # three characters short of the end, so the required final stop codon is
    # not counted as a premature stop.
    has_internal_stop = any(
        seq[i:i + 3] in _STOP_CODONS for i in range(0, len(seq) - 3, 3)
    )
    return not has_internal_stop
def main():
    """
    Main function to process and validate E. coli gene data.

    Reads ``data/CAI.csv`` (columns renamed to gene_id, cai_score, drop1,
    drop2, dna_sequence, drop3) and ``data/Database 3_4300 gene.csv`` (read
    as a single dna_sequence column). Rows of the first file whose sequence
    passes ``is_valid_sequence`` are translated to protein, flagged according
    to whether the same DNA string occurs in the second file, and written to
    ``data/ecoli_processed_genes.csv``. Progress is reported on stdout.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('data', exist_ok=True)

    print("Loading data from CSV files...")
    # header=0 combined with names= discards each file's own header row and
    # substitutes the column names given here.
    df_all = pd.read_csv(
        "data/CAI.csv",
        header=0,
        names=['gene_id', 'cai_score', 'drop1', 'drop2', 'dna_sequence', 'drop3'],
    )
    df_high_cai = pd.read_csv(
        "data/Database 3_4300 gene.csv",
        header=0,
        names=['dna_sequence'],
    )
    # A set gives constant-time membership tests inside the loop below.
    high_cai_sequences = set(df_high_cai['dna_sequence'])

    output_columns = [
        'gene_id',
        'dna_sequence',
        'protein_sequence',
        'cai_score',
        'is_high_cai',
    ]
    validated_genes = []
    # Walk only the three columns that are used; this avoids constructing a
    # Series for every row the way iterrows() does.
    rows = zip(df_all['gene_id'], df_all['dna_sequence'], df_all['cai_score'])
    for gene_id, raw_sequence, cai_score in rows:
        # str() turns a missing cell (NaN) into 'nan', which fails validation.
        dna_sequence = str(raw_sequence)
        if not is_valid_sequence(dna_sequence):
            continue

        # NOTE(review): translate() is called with its defaults; presumably
        # the terminal stop codon is rendered as '*' and alternative start
        # codons are not rendered as M - confirm this is the intended output.
        protein_sequence = str(Seq(dna_sequence).translate())
        # Exact string match: unlike the validation step, this comparison is
        # case-sensitive. NOTE(review): confirm both files use the same case.
        is_high_cai = dna_sequence in high_cai_sequences
        validated_genes.append({
            'gene_id': gene_id,
            'dna_sequence': dna_sequence,
            'protein_sequence': protein_sequence,
            'cai_score': cai_score,
            'is_high_cai': is_high_cai,
        })

    # Passing columns explicitly keeps the CSV header intact even when no gene
    # passes validation (an empty list would otherwise yield a column-less
    # frame and a header-less file).
    df_processed = pd.DataFrame(validated_genes, columns=output_columns)
    output_path = 'data/ecoli_processed_genes.csv'
    df_processed.to_csv(output_path, index=False)
    print(f"Processed data saved to {output_path}")
    print(f"Total validated genes: {len(df_processed)}")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()