Spaces:

Jayesh13
/

HI_SCBL

Runtime error

App Files Files Community

Jayesh13 committed on Oct 11, 2024

Commit

82bea84

verified ·

1 Parent(s): a361d73

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -42

app.py CHANGED Viewed

@@ -8,8 +8,6 @@ import xlsxwriter
 from io import BytesIO
 from collections import defaultdict
 # Function to find repeated amino acids in the protein sequence
 def find_homorepeats(protein):
     n = len(protein)
@@ -29,31 +27,26 @@ def find_homorepeats(protein):
     return freq
-# Function to process a single CSV file and return its analysis
-def process_csv(file):
-    df = pd.read_csv(file)
-    if len(df.columns) < 3:
-        st.error(f"Error: The file must have at least three columns: ID, Protein Name, Sequence")
-        return None
-    # Storing entry ID, protein name, and sequence
-    sequences = []
-    for _, row in df.iterrows():
-        entry_id = str(row[0])
-        protein_name = str(row[1])
-        sequence = str(row[2]).replace('"', '').replace(' ', '')
-        sequences.append((entry_id, protein_name, sequence))
-    # Analyzing homorepeats in the sequences
     homorepeats = set()
     sequence_data = []
-    for entry_id, protein_name, sequence in sequences:
-        freq = find_homorepeats(sequence)
-        homorepeats.update(freq.keys())  # Collect unique homorepeats
-        sequence_data.append((entry_id, protein_name, freq))
-    return homorepeats, sequence_data
 # Function to generate and download Excel workbook with separate sheets for each input file
 def create_excel(sequences_data, homorepeats, filenames):
@@ -88,23 +81,6 @@ def create_excel(sequences_data, homorepeats, filenames):
     output.seek(0)
     return output
-# Function to process the Excel file
-def process_excel(excel_data):
-    # Custom logic to process each sheet within the Excel file
-    homorepeats = set()
-    sequence_data = []
-    for sheet_name in excel_data.sheet_names:
-        df = excel_data.parse(sheet_name)
-        for index, row in df.iterrows():
-            entry_id = row['Entry ID']
-            protein_name = row['Protein Name']
-            freq = {repeat: row[repeat] for repeat in df.columns[2:]}  # Assuming repeats start from 3rd column
-            sequence_data.append((entry_id, protein_name, freq))
-            homorepeats.update(freq.keys())
-    return homorepeats, sequence_data
 # Streamlit UI components
 st.title("Protein Homorepeat Analysis")
@@ -119,7 +95,7 @@ if uploaded_files:
     for file in uploaded_files:
         excel_data = pd.ExcelFile(file)
-        homorepeats, sequence_data = process_excel(excel_data)  # Modify your process_csv function to process_excel
         if homorepeats is not None:
             all_homorepeats.update(homorepeats)
             all_sequences_data.append(sequence_data)
@@ -152,4 +128,3 @@ if uploaded_files:
             result_df = pd.DataFrame(rows)
             st.dataframe(result_df)

 from io import BytesIO
 from collections import defaultdict
 # Function to find repeated amino acids in the protein sequence
 def find_homorepeats(protein):
     n = len(protein)
     return freq
+# Function to process every sheet of an Excel workbook and return the combined analysis
+def process_excel(excel_data):
     homorepeats = set()
     sequence_data = []
+    for sheet_name in excel_data.sheet_names:
+        df = excel_data.parse(sheet_name)
+        if len(df.columns) < 3:
+            st.error(f"Error: The sheet '{sheet_name}' must have at least three columns: ID, Protein Name, Sequence")
+            return None, None
+        for _, row in df.iterrows():
+            entry_id = str(row[0])
+            protein_name = str(row[1])
+            sequence = str(row[2]).replace('"', '').replace(' ', '')
+            freq = find_homorepeats(sequence)
+            sequence_data.append((entry_id, protein_name, freq))
+            homorepeats.update(freq.keys())  # Collect unique homorepeats
+    return homorepeats, sequence_data
 # Function to generate and download Excel workbook with separate sheets for each input file
 def create_excel(sequences_data, homorepeats, filenames):
     output.seek(0)
     return output
 # Streamlit UI components
 st.title("Protein Homorepeat Analysis")
     for file in uploaded_files:
         excel_data = pd.ExcelFile(file)
+        homorepeats, sequence_data = process_excel(excel_data)
         if homorepeats is not None:
             all_homorepeats.update(homorepeats)
             all_sequences_data.append(sequence_data)
             result_df = pd.DataFrame(rows)
             st.dataframe(result_df)