Spaces:

Andolinism
/

protein-profile-viewer

Runtime error

App Files Files Community

Kaveh committed on May 7, 2025

Commit

1c7472d

unverified ·

1 Parent(s): 2b486f1

Update app.py

Browse files

Files changed (1) hide show

app.py +281 -241

app.py CHANGED Viewed

@@ -8,10 +8,8 @@ from PIL import Image
 import sys
 import traceback
 import json
-import pandas as pd # Keep import, might be useful later
 UNIPROT_API_URL = "https://rest.uniprot.org/uniprotkb/{accession}.json"
-UNIPROT_SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search"
 AMINO_ACID_NAMES = {
     'A': 'Alanine', 'R': 'Arginine', 'N': 'Asparagine', 'D': 'Aspartic acid',
@@ -22,191 +20,256 @@ AMINO_ACID_NAMES = {
 }
 STANDARD_AMINO_ACIDS_ORDER = "ARNDCQEGHILKMFPSTWYV"
-# --- Helper Functions (Keep as is) ---
-def get_amino_acid_frequencies(sequence): # ... (same as before) ...
     if not sequence or sequence == "N/A": return None, "Sequence not available for analysis."
     cleaned_sequence = "".join(filter(lambda x: x in AMINO_ACID_NAMES, sequence.upper()))
-    if not cleaned_sequence: return None, "No valid amino acids found for counting."
-    counts = Counter(cleaned_sequence); frequencies = {aa: counts.get(aa, 0) for aa in STANDARD_AMINO_ACIDS_ORDER}
     return frequencies, None
-def plot_amino_acid_frequencies(frequencies): # ... (same as before) ...
     if not frequencies: return None
     ordered_keys = [key for key in STANDARD_AMINO_ACIDS_ORDER if key in frequencies]
-    labels = [f"{aa}: {AMINO_ACID_NAMES.get(aa, aa)}" for aa in ordered_keys]; values = [frequencies[aa] for aa in ordered_keys]
     fig, ax = plt.subplots(figsize=(12, 7)); ax.bar(labels, values, color='skyblue')
-    ax.set_xlabel("Amino Acid"); ax.set_ylabel("Frequency"); ax.set_title("AA Freq Plot")
     plt.xticks(rotation=75, ha="right", fontsize=8); plt.tight_layout()
-    buf = io.BytesIO(); plt.savefig(buf, format='png'); buf.seek(0); img = Image.open(buf); plt.close(fig); return img
-def extract_sequence_features(uniprot_data): # ... (same as before) ...
-    features_of_interest_uppercase = {"DOMAIN": "blue", "MOTIF": "green", "ACTIVE_SITE": "red", "BINDING_SITE": "orange", "MOD_RES": "purple", "HELIX": "cyan", "STRAND": "magenta", "TURN": "gold"}
-    extracted = []
-    if "features" in uniprot_data and uniprot_data["features"]:
-        for item in uniprot_data["features"]:
-            type_raw = item.get("type"); loc_obj = item.get("location", {})
-            if not isinstance(type_raw, str): continue
-            type_norm = type_raw.strip().upper()
-            if type_norm in features_of_interest_uppercase:
                 try:
-                    b_str, e_str = None, None; s_node, e_node, p_node = loc_obj.get("start"), loc_obj.get("end"), loc_obj.get("position")
-                    if s_node and isinstance(s_node, dict) and "value" in s_node: b_str = str(s_node["value"])
-                    if e_node and isinstance(e_node, dict) and "value" in e_node: e_str = str(e_node["value"])
-                    if p_node and isinstance(p_node, dict) and "value" in p_node:
-                        p_str = str(p_node["value"]);
-                        if b_str is None: b_str = p_str
-                        if e_str is None: e_str = p_str
-                        if "start" not in loc_obj and "end" not in loc_obj: b_str, e_str = p_str, p_str
-                    if b_str is None or e_str is None: continue
-                    b_pos, e_pos = int(b_str), int(e_str)
-                    if b_pos > e_pos: continue
-                    extracted.append({"type": type_raw, "begin": b_pos, "end": e_pos, "description": item.get("description", type_raw), "color": features_of_interest_uppercase[type_norm]})
-                except: continue
-    return extracted
-def plot_sequence_features(sequence_length, features): # ... (same as before) ...
     if not features or sequence_length == 0: return None
     fig, ax = plt.subplots(figsize=(12, max(3, len(features) * 0.4) + 1.5))
-    ax.set_xlim(0, sequence_length); ax.set_xlabel("AA Position"); ax.set_yticks([])
-    ax.set_title("Sequence Features"); leg_h = {}
-    y_pos, leg_set, b_h = 0, set(), 0.8
-    for feat in sorted(features, key=lambda x: x["begin"]):
-        b, e, c = feat["begin"], feat["end"], feat["color"]; w = max(1, e - b + 1)
-        ax.barh(y_pos, w, height=b_h, left=b -1, color=c, edgecolor='black', alpha=0.7)
-        if feat['type'] not in leg_set: leg_h[feat['type']] = plt.Rectangle((0, 0), 1, 1, fc=c, alpha=0.7); leg_set.add(feat['type'])
-        y_pos += 1
-    if y_pos > 0: ax.set_ylim(-0.5, y_pos -1 + b_h/2 + 0.5)
     else: plt.close(fig); return None
-    if leg_h: ax.legend(leg_h.values(), leg_h.keys(), title="Feature Types", bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=8)
     plt.tight_layout(rect=[0, 0, 0.83, 0.96])
-    buf = io.BytesIO(); plt.savefig(buf, format='png'); buf.seek(0); img = Image.open(buf); plt.close(fig); return img
-def extract_interactions(uniprot_data): # ... (same as before) ...
-    interactions = []
     if "comments" in uniprot_data:
-        for c in uniprot_data["comments"]:
-            if c.get("commentType") == "INTERACTION" and "interactions" in c:
-                for i_entry in c["interactions"]:
-                    i1_acc = i_entry.get("interactantOne", {}).get("uniProtKBAccession")
-                    i2_acc = i_entry.get("interactantTwo", {}).get("uniProtKBAccession")
-                    i2_gene = i_entry.get("interactantTwo", {}).get("geneName")
-                    if i1_acc == uniprot_data.get("primaryAccession") and i2_acc:
-                        partner = f"{i2_gene} ({i2_acc})" if i2_gene else i2_acc
-                        interactions.append(f"- Interacts with: **{partner}**")
-    return "\n".join(sorted(interactions)) if interactions else "No interaction partners listed in comments."
-def extract_pathways(uniprot_data): # ... (same as before) ...
-    pathways = []; dbs = { "KEGG": "...", "Reactome": "..." } # URLs omitted
     if "uniProtKBCrossReferences" in uniprot_data:
         for xref in uniprot_data["uniProtKBCrossReferences"]:
-            db = xref.get("database"); pid = xref.get("id")
-            if db in dbs and pid:
-                desc = pid;
                 if "properties" in xref:
-                    for p in xref["properties"]:
-                        if p.get("key") in ["PathwayName", "Description"]: desc = f"{p.get('value')} ({pid})"; break
-                link = dbs[db] + pid; pathways.append(f"- [{desc}]({link}) ({db})")
-    return "\n".join(sorted(list(set(pathways)))) if pathways else "No KEGG/Reactome pathway info."
-def extract_disease_info(uniprot_data): # ... (same as before) ...
-    diseases = []
     if "comments" in uniprot_data:
-        for c in uniprot_data["comments"]:
-            if c.get("commentType") == "DISEASE" and "disease" in c:
-                d_entry = c["disease"]; d_name = d_entry.get("diseaseId", "?"); desc = d_entry.get("description", "N/A")
-                mim = None;
-                if "diseaseCrossReference" in d_entry and d_entry["diseaseCrossReference"].get("database") == "MIM": mim = d_entry["diseaseCrossReference"].get("id")
-                d_md = f"**{d_name}**" + (f" (MIM: [{mim}](https://omim.org/entry/{mim}))" if mim else "")
-                d_md += f"\n   - *Desc:* {desc}\n"; note_val = c.get("note", {}).get("texts", [{}])[0].get("value")
-                if note_val: d_md += f"   - *Note:* {note_val}\n"
-                diseases.append(d_md)
-    return "\n---\n".join(sorted(diseases)) if diseases else "No disease association info."
-def extract_publications(uniprot_data): # ... (same as before) ...
-    pubs = []
-    if "references" in uniprot_data:
-        for i, ref in enumerate(uniprot_data.get("references", [])):
-            cit = ref.get("citation", {}); title = cit.get("title", "N/A"); authors = ", ".join(cit.get("authors", ["N/A"]))
-            j = cit.get("journalName", ""); v = cit.get("volume", ""); f = cit.get("firstPage", ""); l = cit.get("lastPage", ""); d = cit.get("publicationDate", "")
-            pmid, doi = None, None
-            if "citationCrossReferences" in cit:
-                for xr in cit["citationCrossReferences"]:
-                    if xr.get("database") == "PubMed": pmid = xr.get("id")
-                    elif xr.get("database") == "DOI": doi = xr.get("id")
-            md = f"**{i + 1}. {title}**\n   - *{authors}*\n   - *{j}" + (f", {v}" if v else "") + (f":{f}" if f else "") + (f"-{l}" if l else "")
-            if d: md += f" ({d})"
-            md += "*\n"
-            if pmid: md += f"   - [PubMed {pmid}](https://pubmed.ncbi.nlm.nih.gov/{pmid}/)\n"
-            if doi: md += f"   - [DOI {doi}](https://doi.org/{doi})\n"
-            pubs.append(md)
-    return "\n---\n".join(pubs) if pubs else "No publication info found."
-def extract_cross_references(uniprot_data): # ... (same as before) ...
-    xrefs = []; dbs = {"Ensembl": "...", "GeneID": "...", "RefSeq": "...", "GO": "...", "InterPro": "...", "Pfam": "...", "PDB": "...", "KEGG": "...", "Reactome": "..."} # URLs omitted
-    grouped = {db: [] for db in dbs};
-    if "uniProtKBCrossReferences" in uniprot_data:
-        for xr in uniprot_data["uniProtKBCrossReferences"]:
-            db = xr.get("database"); xid = xr.get("id")
-            if db in dbs and xid:
-                url, txt = None, xid;
-                if db == "GO": txt = xr.get("properties", [{}])[0].get("value", xid) + f" ({xid})"
-                url = dbs[db] + xid
-                if url: link_md = f"[{txt}]({url})";
-                    if link_md not in grouped[db]: grouped[db].append(link_md)
-        for db, links in grouped.items():
-            if links: xrefs.append(f"**{db}:** " + ", ".join(sorted(list(set(links)))))
-    return "\n".join(xrefs) if xrefs else "No selected cross-references."
-# --- Search Function ---
-def search_uniprot_by_name(search_term, result_limit=5):
-    # ... (Implementation from previous version, returns status_md, dataset_data) ...
-    if not search_term or len(search_term) < 3: return "Enter at least 3 characters.", []
-    params = { "query": f'({search_term}) AND (reviewed:true)', "fields": "accession,id,protein_name,organism_name", "format": "json", "size": result_limit }
-    status_md = f"### Search Results for '{search_term}':\n"; dataset_data = []
-    try:
-        response = requests.get(UNIPROT_SEARCH_URL, params=params); response.raise_for_status(); data = response.json()
-        results = data.get("results")
-        if not results: status_md += "No reviewed entries found."
-        else:
-            status_md += f"*Found {len(results)}. Select a row below, then copy the ID.*\n---"
-            for entry in results:
-                acc = entry.get("primaryAccession", "N/A"); uid = entry.get("uniProtkbId", ""); name = "N/A"
-                if entry.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", {}).get("value"): name = entry["proteinDescription"]["recommendedName"]["fullName"]["value"]
-                elif entry.get("proteinDescription", {}).get("submissionNames"): name = entry["proteinDescription"]["submissionNames"][0].get("fullName",{}).get("value", "N/A")
-                org = entry.get("organism", {}).get("scientificName", "N/A"); dataset_data.append([acc, uid, name, org])
-            status_md += "\n---"
-    except Exception as e: status_md += f"\n**Search Error:** {e}"
-    return status_md, dataset_data
-# --- Function to update copy box (Corrected) ---
-def update_copy_box(results_data, evt: gr.SelectData):
-    """Updates the textbox with the Accession ID from the selected row."""
-    if evt.index is None or results_data is None:
-        return "" # No selection or no data
-    row_index = evt.index[0] # Get the row index of the selection
-    # Ensure the row index is valid for the current dataset results
-    if 0 <= row_index < len(results_data):
-        selected_row = results_data[row_index]
-        accession_id = selected_row[0] # Accession ID is the first element
-        return accession_id
-    return "" # Return empty if index is out of bounds
-# --- Main Function to Get All Info ---
 def get_protein_info(uniprot_id):
-    # ... (Implementation from v1.5.1, returns 9 outputs) ...
-    empty_plot = gr.update(value=None, visible=False); empty_str = ""; err_msg = "Error"
-    outputs_on_error = (err_msg, empty_plot, empty_plot, empty_str, empty_str, empty_str, empty_str, empty_str, empty_str)
-    if not uniprot_id: return ("Enter UniProt ID.",) + outputs_on_error[1:]
     url = UNIPROT_API_URL.format(accession=uniprot_id.strip().upper())
     try:
         response = requests.get(url); response.raise_for_status(); data = response.json()
         acc = data.get("primaryAccession", "N/A"); id_display = data.get("uniProtkbId", "N/A")
-        link = f"https://www.uniprot.org/uniprotkb/{acc}/entry"; name_dict = data.get("proteinDescription", {}).get("recommendedName", {}); name = name_dict.get("fullName", {}).get("value", "N/A")
         if name == "N/A" and data.get("proteinDescription", {}).get("submissionNames"): name = data["proteinDescription"]["submissionNames"][0].get("fullName", {}).get("value", "N/A")
         genes_data = data.get("genes"); gene_str = "N/A"
         if genes_data:
             g_list = [g.get("geneName", {}).get("value", "") for g in genes_data if g.get("geneName")]
             if not g_list: g_list = [g.get("orfNames", [{}])[0].get("value", "") for g in genes_data if g.get("orfNames")]
             gene_str = ", ".join(filter(None, g_list)) or "N/A"
-        org_data = data.get("organism", {}); org_sci_name = org_data.get("scientificName", "N/A"); org_common_name = org_data.get("commonName", "")
-        org_display = f"{org_sci_name}" + (f" ({org_common_name})" if org_common_name else ""); seq_info = data.get("sequence", {}); seq_val = seq_info.get("value", "N/A"); length = seq_info.get("length", 0)
-        status_val = data.get("entryAudit", {}).get("entryType", "N/A").replace("UniProtKB ", ""); existence_val = data.get("proteinExistence", "N/A").replace(": Evidence at ", ": ")
-        score_val = data.get("annotationScore", "N/A"); mw_str = "N/A"
         if seq_val != "N/A" and length > 0:
             try:
                 clean_seq = "".join(filter(lambda x: x in AMINO_ACID_NAMES, seq_val.upper()))
@@ -214,114 +277,91 @@ def get_protein_info(uniprot_id):
                 else: mw_str = "Invalid sequence for MW"
             except: mw_str = "Error in MW calc"
         comments_data = data.get("comments", []); func_comment = "N/A"
-        for c_item in comments_data:
-            if c_item.get("commentType") == "FUNCTION":
-                texts = c_item.get("texts", [])
-                if texts:
-                    func_comment = texts[0].get("value", "N/A")
-                    break
-        overview_md = (f"## {id_display} ({acc})\n[{acc} on UniProt]({link})\n\n**Protein:** {name}\n**Gene:** {gene_str}\n**Status:** {status_val}\n"
-                       f"**Organism:** {org_display}\n**Length:** {length} aa\n**Existence:** {existence_val}\n**Score:** {score_val}/5\n**Calc. MW:** {mw_str}\n\n"
-                       f"**Function Snippet:**\n{func_comment}\n\n**Sequence (first 100 aa):**\n`{seq_val[:100]}{'...' if len(seq_val) > 100 else ''}`\n\n--- \n*More details in other tabs.*")
-        interactions_md = extract_interactions(data); pathways_md = extract_pathways(data)
-        disease_md = extract_disease_info(data); publications_md = extract_publications(data); xref_md = extract_cross_references(data)
-        aa_freq, aa_err = get_amino_acid_frequencies(seq_val); aa_plot_upd = empty_plot
         if aa_err: overview_md += f"\n\n**AA Freq Error:** {aa_err}"
-        elif aa_freq: img_aa = plot_amino_acid_frequencies(aa_freq);
             if img_aa: aa_plot_upd = gr.update(value=img_aa, visible=True)
-        seq_feat = extract_sequence_features(data); feat_plot_upd = empty_plot; feat_msg = ""
         if seq_feat and length > 0:
             img_feat = plot_sequence_features(length, seq_feat)
             if img_feat: feat_plot_upd = gr.update(value=img_feat, visible=True)
-            else: feat_msg = "Could not generate feature plot."
         elif not seq_feat and length > 0 : feat_msg = "No relevant features found for plotting."
-        return (overview_md, aa_plot_upd, feat_plot_upd, feat_msg, pathways_md, interactions_md, disease_md, publications_md, xref_md)
     except requests.exceptions.HTTPError as e:
         err_msg_http = f"Error: ID '{uniprot_id}' not found." if e.response.status_code == 404 else f"HTTP error: {e}"
         return (err_msg_http,) + outputs_on_error[1:]
-    except Exception as e: return (f"Error: {str(e)[:150]}",) + outputs_on_error[1:]
-# --- Gradio UI Definition ---
 with gr.Blocks(theme=gr.themes.Glass()) as iface:
-    gr.Markdown("# Protein Profile Viewer (v1.6.2 - Select Fix)") # Version updated
-    gr.Markdown("Enter a UniProt ID directly, **OR** search for a protein/gene name below to find and copy its ID.")
-    with gr.Group():
-        gr.Markdown("### Find UniProt ID by Name/Keyword")
-        search_term_input = gr.Textbox(label="Search Term (min 3 chars, type and wait)", placeholder="e.g., insulin, EGFR, P53")
-        search_status_output = gr.Markdown()
-        # Dataset to store search results data (invisible, used as state)
-        search_results_data_state = gr.State([])
-        # Dataset component for display
-        search_results_output_display = gr.Dataset(
-            label="Search Results (Select a row to copy Accession)",
-            headers=["Accession", "UniProtKB ID", "Protein Name", "Organism"],
-            samples=[], samples_per_page=5
-        )
-        selected_id_to_copy = gr.Textbox(label="Copy this Accession ID:", interactive=True, show_copy_button=True)
-    gr.Markdown("---")
-    gr.Markdown("### View Protein Profile")
     with gr.Row():
-        protein_id_input = gr.Textbox(label="Enter UniProt Accession ID", placeholder="Paste ID here or enter directly", scale=3)
-        submit_button = gr.Button("Get Profile", scale=1, variant="primary")
     with gr.Tabs():
-        # ... (Tabs definition remains the same) ...
         with gr.TabItem("Overview"):
-             gr.Markdown("### Protein Overview\nKey information...")
-             overview_output = gr.Markdown()
         with gr.TabItem("Analysis Plots"):
-             gr.Markdown("### Sequence Analysis Visualizations\nAmino acid composition and annotated features.")
-             with gr.Column():
-                 aa_freq_plot_output = gr.Image(label="Amino Acid Frequency Plot", type="pil", show_label=True, visible=False)
-                 seq_features_plot_output = gr.Image(label="Sequence Features Plot", type="pil", show_label=True, visible=False)
-                 seq_features_message_output = gr.Markdown()
         with gr.TabItem("Functional Context"):
-             gr.Markdown("### Pathways, Interactions & Disease\nBiological context.")
-             with gr.Accordion("Biological Pathways", open=False): pathways_output = gr.Markdown()
-             with gr.Accordion("Protein Interactions", open=False): interactions_output = gr.Markdown()
-             with gr.Accordion("Disease Associations", open=False): disease_output = gr.Markdown()
         with gr.TabItem("Publications"):
-             gr.Markdown("### Relevant Publications\nAssociated scientific literature.")
-             publications_output = gr.Markdown()
-        with gr.TabItem("Cross-references"):
-             gr.Markdown("### Database Links\nLinks to other relevant databases.")
-             xref_output = gr.Markdown()
-    # --- Event Handlers (Corrected) ---
-    search_term_input.change(
-        fn=search_uniprot_by_name,
-        inputs=search_term_input,
-        # Output both the status message and the data for the Dataset state
-        outputs=[search_status_output, search_results_data_state]
-    )
-    # When the state data changes, update the displayed Dataset
-    search_results_data_state.change(
-        fn=lambda data: data, # Simple function to pass data through
-        inputs=search_results_data_state,
-        outputs=search_results_output_display
-    )
-    # When a row is selected in the displayed Dataset, update the copy box
-    search_results_output_display.select(
-        fn=update_copy_box,
-        # Pass the *state* containing the data and the event data
-        inputs=[search_results_data_state],
-        outputs=selected_id_to_copy
-        # Removed the incorrect _js argument
-    )
-    # Main profile submit button action
     submit_button.click(
-        fn=get_protein_info, inputs=protein_id_input,
         outputs=[overview_output, aa_freq_plot_output, seq_features_plot_output, seq_features_message_output,
                  pathways_output, interactions_output, disease_output, publications_output, xref_output]
     )
-    gr.Examples(examples=[["P05067"], ["P00533"], ["Q9BYF1"], ["P0DP23"], ["P04637"]], inputs=protein_id_input)
 if __name__ == "__main__":
     iface.launch()

 import sys
 import traceback
 import json
 UNIPROT_API_URL = "https://rest.uniprot.org/uniprotkb/{accession}.json"
 AMINO_ACID_NAMES = {
     'A': 'Alanine', 'R': 'Arginine', 'N': 'Asparagine', 'D': 'Aspartic acid',
 }
 STANDARD_AMINO_ACIDS_ORDER = "ARNDCQEGHILKMFPSTWYV"
def get_amino_acid_frequencies(sequence):
    """Count how often each standard amino acid occurs in a sequence.

    Args:
        sequence: Protein sequence as a string of one-letter codes; case is
            ignored. Characters that are not keys of ``AMINO_ACID_NAMES`` are
            dropped before counting.

    Returns:
        A ``(frequencies, error)`` pair. On success ``frequencies`` maps every
        letter of ``STANDARD_AMINO_ACIDS_ORDER`` to its count (0 when absent)
        and ``error`` is ``None``. On failure ``frequencies`` is ``None`` and
        ``error`` is a human-readable message.
    """
    if not sequence or sequence == "N/A":
        return None, "Sequence not available for analysis."

    # Keep only recognised residue letters.
    residues = [letter for letter in sequence.upper() if letter in AMINO_ACID_NAMES]
    if not residues:
        return None, "No valid amino acids found in sequence for counting."

    tally = Counter(residues)
    # Counter yields 0 for missing keys, so every standard residue is present.
    return {aa: tally[aa] for aa in STANDARD_AMINO_ACIDS_ORDER}, None
def plot_amino_acid_frequencies(frequencies):
    """Render a bar chart of amino-acid counts and return it as a PIL image.

    Args:
        frequencies: Mapping of one-letter amino-acid code to its count.
            Only keys listed in ``STANDARD_AMINO_ACIDS_ORDER`` are plotted,
            in that order.

    Returns:
        A ``PIL.Image`` decoded from the PNG rendering of the chart, or
        ``None`` when ``frequencies`` is empty or ``None``.
    """
    if not frequencies:
        return None

    ordered_keys = [aa for aa in STANDARD_AMINO_ACIDS_ORDER if aa in frequencies]
    labels = [f"{aa}: {AMINO_ACID_NAMES.get(aa, aa)}" for aa in ordered_keys]
    values = [frequencies[aa] for aa in ordered_keys]

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        ax.bar(labels, values, color='skyblue')
        ax.set_xlabel("Amino Acid")
        ax.set_ylabel("Frequency")
        ax.set_title("Amino Acid Frequency Plot")
        # Operate on this figure's own axes instead of pyplot's implicit
        # "current figure", so another figure created in between cannot be
        # styled or saved by mistake.
        plt.setp(ax.get_xticklabels(), rotation=75, ha="right", fontsize=8)
        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        img = Image.open(buf)
        # Image.open is lazy; decode now so the result is fully materialised.
        img.load()
        return img
    finally:
        # Always release the figure, even if rendering raised; pyplot keeps
        # every open figure alive until it is closed explicitly.
        plt.close(fig)
def extract_sequence_features(uniprot_data):
    """Collect plottable sequence features from a UniProt entry dict.

    Feature type names are matched case-insensitively with internal
    whitespace collapsed to underscores, so both the spaced spellings
    (``"Active site"``, ``"Beta strand"``, ``"Modified residue"``) and the
    underscore/abbreviated spellings (``"ACTIVE_SITE"``, ``"STRAND"``,
    ``"MOD_RES"``) are recognised.

    Args:
        uniprot_data: Parsed UniProt entry. Its ``"features"`` value, when
            present, is expected to be a list of dicts with ``"type"`` and
            ``"location"`` keys; ``"location"`` holds ``"start"``/``"end"``
            and/or ``"position"`` nodes of the form ``{"value": ...}``.

    Returns:
        A list of dicts with keys ``"type"`` (the raw type string),
        ``"begin"``, ``"end"`` (ints, ``begin <= end``), ``"description"``
        (defaults to the raw type when the key is missing) and ``"color"``.
        Features of other types, with missing or non-integer coordinates, or
        with ``begin > end`` are skipped.
    """
    feature_colors = {
        "DOMAIN": "blue", "MOTIF": "green", "ACTIVE_SITE": "red",
        "BINDING_SITE": "orange",
        "MOD_RES": "purple", "MODIFIED_RESIDUE": "purple",
        "HELIX": "cyan",
        "STRAND": "magenta", "BETA_STRAND": "magenta",
        "TURN": "gold",
    }

    def _coordinate(node):
        """Return the integer stored in a ``{"value": ...}`` node, else None."""
        if isinstance(node, dict) and "value" in node:
            # Round-trip through str so "12" and 12 are both accepted while
            # None / floats / booleans raise ValueError and get skipped.
            return int(str(node["value"]))
        return None

    extracted_features = []
    for feature_item in uniprot_data.get("features") or []:
        if not isinstance(feature_item, dict):
            continue
        feature_type_raw = feature_item.get("type")
        if not isinstance(feature_type_raw, str):
            continue
        # "Active site" -> "ACTIVE_SITE"; already-normalised keys are unchanged.
        feature_key = "_".join(feature_type_raw.upper().split())
        color = feature_colors.get(feature_key)
        if color is None:
            continue

        location = feature_item.get("location", {})
        if not isinstance(location, dict):
            continue
        try:
            begin_pos = _coordinate(location.get("start"))
            end_pos = _coordinate(location.get("end"))
            if begin_pos is None or end_pos is None:
                # Single-residue features may carry only a "position" node;
                # it fills whichever boundary is missing.
                position = _coordinate(location.get("position"))
                if begin_pos is None:
                    begin_pos = position
                if end_pos is None:
                    end_pos = position
        except (ValueError, TypeError):
            continue

        if begin_pos is None or end_pos is None or begin_pos > end_pos:
            continue
        extracted_features.append({
            "type": feature_type_raw, "begin": begin_pos, "end": end_pos,
            "description": feature_item.get("description", feature_type_raw),
            "color": color,
        })
    return extracted_features
def plot_sequence_features(sequence_length, features):
    """Draw each sequence feature as a horizontal bar and return a PIL image.

    Features are sorted by their ``"begin"`` coordinate and drawn one per
    row; the legend lists each feature type once, using the colour of its
    first occurrence.

    Args:
        sequence_length: Length of the protein sequence; used as the upper
            x-axis limit.
        features: List of dicts with ``"type"``, ``"begin"``, ``"end"`` and
            ``"color"`` keys (1-based inclusive coordinates are assumed from
            the ``begin - 1`` offset used for the bar's left edge).

    Returns:
        A ``PIL.Image`` decoded from the PNG rendering, or ``None`` when
        there are no features or the sequence length is zero/``None``.
    """
    if not features or not sequence_length:
        return None

    bar_height = 0.8
    ordered_features = sorted(features, key=lambda feat: feat["begin"])
    fig_height = max(3, len(ordered_features) * 0.4) + 1.5

    fig, ax = plt.subplots(figsize=(12, fig_height))
    try:
        ax.set_xlim(0, sequence_length)
        ax.set_xlabel("Amino Acid Position")
        ax.set_yticks([])
        ax.set_title("Sequence Features Plot")

        legend_handles = {}
        for row, feature in enumerate(ordered_features):
            begin, end, color = feature["begin"], feature["end"], feature["color"]
            # Guarantee at least one residue of width so point features stay visible.
            width = max(1, end - begin + 1)
            ax.barh(row, width, height=bar_height, left=begin - 1,
                    color=color, edgecolor='black', alpha=0.7)
            if feature["type"] not in legend_handles:
                legend_handles[feature["type"]] = plt.Rectangle(
                    (0, 0), 1, 1, fc=color, alpha=0.7)

        ax.set_ylim(-0.5, len(ordered_features) - 1 + bar_height / 2 + 0.5)
        ax.legend(list(legend_handles.values()), list(legend_handles.keys()),
                  title="Feature Types", bbox_to_anchor=(1.02, 1),
                  loc='upper left', fontsize=8)
        # Leave room on the right for the legend placed outside the axes.
        fig.tight_layout(rect=[0, 0, 0.83, 0.96])

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        img = Image.open(buf)
        # Image.open is lazy; decode now so the result is fully materialised.
        img.load()
        return img
    finally:
        # Close the figure on every path (including malformed feature dicts
        # raising KeyError) so pyplot does not accumulate open figures.
        plt.close(fig)
def extract_interactions(uniprot_data):
    """Summarise interaction partners from a UniProt entry as Markdown bullets.

    Only interactions whose ``interactantOne`` accession equals the entry's
    ``primaryAccession`` and whose ``interactantTwo`` has an accession are
    listed. Each partner is shown as ``GENE (ACCESSION)`` when a gene name is
    available, otherwise by accession alone.

    Args:
        uniprot_data: Parsed UniProt entry dict; ``"comments"`` may be
            missing or ``None``.

    Returns:
        Newline-joined, alphabetically sorted, de-duplicated Markdown bullet
        lines, or a fixed "no partners" message when nothing qualifies.
    """
    primary_accession = uniprot_data.get("primaryAccession")
    partner_lines = set()  # a set so repeated partner entries collapse to one bullet

    for comment in uniprot_data.get("comments") or []:
        if comment.get("commentType") != "INTERACTION":
            continue
        for interaction_entry in comment.get("interactions") or []:
            # `or {}` also covers keys that are present but null.
            interactant_one = interaction_entry.get("interactantOne") or {}
            interactant_two = interaction_entry.get("interactantTwo") or {}
            partner_accession = interactant_two.get("uniProtKBAccession")
            if interactant_one.get("uniProtKBAccession") != primary_accession:
                continue
            if not partner_accession:
                continue
            partner_gene = interactant_two.get("geneName")
            partner_display_name = (
                f"{partner_gene} ({partner_accession})" if partner_gene else partner_accession
            )
            partner_lines.add(f"- Interacts with: **{partner_display_name}**")

    if not partner_lines:
        return "No specific interaction partners listed in UniProt comments."
    # Sort alphabetically for a stable, readable listing.
    return "\n".join(sorted(partner_lines))
def extract_pathways(uniprot_data):
    """List KEGG and Reactome cross-references as Markdown links.

    Args:
        uniprot_data: Parsed UniProt entry dict; cross-references are read
            from ``"uniProtKBCrossReferences"`` (may be missing or ``None``).

    Returns:
        Newline-joined, sorted, de-duplicated bullet lines of the form
        ``- [<description> (<id>)](<url>) (<database>)``; the description is
        taken from the first ``PathwayName``/``Description`` property with a
        meaningful value and omitted otherwise. Returns a fixed message when
        no KEGG/Reactome cross-reference is present.
    """
    pathway_databases = {
        "KEGG": "https://www.genome.jp/dbget-bin/www_bget?",
        "Reactome": "https://reactome.org/content/detail/",
    }
    description_keys = ("PathwayName", "Description")

    pathways = set()
    for xref in uniprot_data.get("uniProtKBCrossReferences") or []:
        db_name = xref.get("database")
        pathway_id = xref.get("id")
        if db_name not in pathway_databases or not pathway_id:
            continue

        pathway_description = ""
        for prop in xref.get("properties") or []:
            value = prop.get("value")
            # "-" is a placeholder for "no value"; showing it would render a
            # misleading "- (id)" link label.
            if prop.get("key") in description_keys and value and value != "-":
                pathway_description = value
                break

        link = pathway_databases[db_name] + pathway_id
        display_text = (
            f"{pathway_description} ({pathway_id})" if pathway_description else pathway_id
        )
        pathways.add(f"- [{display_text}]({link}) ({db_name})")

    if not pathways:
        return "No pathway information found in KEGG or Reactome cross-references."
    return "\n".join(sorted(pathways))
def extract_disease_info(uniprot_data):
    """Format DISEASE comments of a UniProt entry as Markdown.

    Each comment with a ``"disease"`` object yields a block with the disease
    name, an OMIM link when the cross-reference database is ``"MIM"``, the
    description and any notes. DISEASE comments that carry only a note (no
    ``"disease"`` object) are kept as a "General disease note" block instead
    of being dropped.

    Args:
        uniprot_data: Parsed UniProt entry dict; ``"comments"`` may be
            missing or ``None``.

    Returns:
        The blocks sorted alphabetically and joined by ``"\\n---\\n"``, or a
        fixed message when no disease information is present.
    """
    disease_info_list = []
    for comment in uniprot_data.get("comments") or []:
        if comment.get("commentType") != "DISEASE":
            continue

        # `or {}` / `or []` tolerate keys that are present but null.
        note_texts = (comment.get("note") or {}).get("texts") or []
        note_values = [text.get("value") for text in note_texts if text.get("value")]

        disease_entry = comment.get("disease")
        if isinstance(disease_entry, dict):
            disease_name = disease_entry.get("diseaseId", "Unknown disease")
            description = disease_entry.get("description", "No description available.")
            cross_ref = disease_entry.get("diseaseCrossReference") or {}
            mim_id = cross_ref.get("id") if cross_ref.get("database") == "MIM" else None

            disease_md = f"**{disease_name}**"
            if mim_id:
                disease_md += f" (MIM: [{mim_id}](https://www.omim.org/entry/{mim_id}))"
            disease_md += f"\n   - *Description:* {description}\n"
        elif note_values:
            # Note-only disease comment: there is no named disease to head the block.
            disease_md = "**General disease note**\n"
        else:
            continue

        disease_md += "".join(f"   - *Note:* {value}\n" for value in note_values)
        disease_info_list.append(disease_md)

    if not disease_info_list:
        return "No specific disease association information found in UniProt comments."
    # Blocks start with the disease name, so this sorts alphabetically by name.
    return "\n---\n".join(sorted(disease_info_list))
def extract_publications(uniprot_data):
    """Format the references of a UniProt entry as a numbered Markdown list.

    Args:
        uniprot_data: Parsed UniProt entry dict; ``"references"`` may be
            missing or ``None``. Each reference is expected to hold a
            ``"citation"`` dict.

    Returns:
        One block per reference (title, authors, journal line, PubMed and DOI
        links when available) joined by ``"\\n---\\n"`` in entry order, or a
        fixed message when there are no references.
    """
    publications_list = []
    for number, ref in enumerate(uniprot_data.get("references") or [], start=1):
        citation = ref.get("citation") or {}
        title = citation.get("title", "N/A")
        authors = ", ".join(citation.get("authors") or ["N/A"])
        # UniProt REST citations name this field "journal"; "journalName" is
        # still honoured so data using the older key keeps working.
        journal = citation.get("journal") or citation.get("journalName") or "N/A"
        volume = citation.get("volume", "")
        first_page = citation.get("firstPage", "")
        last_page = citation.get("lastPage", "")
        publication_date = citation.get("publicationDate", "")

        pubmed_id = None
        doi_id = None
        for xref_cite in citation.get("citationCrossReferences") or []:
            database = xref_cite.get("database")
            if database == "PubMed":
                pubmed_id = xref_cite.get("id")
            elif database == "DOI":
                doi_id = xref_cite.get("id")

        journal_line = f"   - *Journal:* {journal}"
        if volume:
            journal_line += f", Vol. {volume}"
        if first_page:
            # Only show a page range when its start is known; a lone last
            # page would otherwise render as a dangling "-<page>".
            journal_line += f", pp. {first_page}"
            if last_page:
                journal_line += f"-{last_page}"
        if publication_date:
            journal_line += f" ({publication_date})"

        lines = [
            f"**{number}. Title:** {title}",
            f"   - *Authors:* {authors}",
            journal_line,
        ]
        if pubmed_id:
            lines.append(
                f"   - *PubMed:* [{pubmed_id}](https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}/)")
        if doi_id:
            lines.append(f"   - *DOI:* [{doi_id}](https://doi.org/{doi_id})")
        publications_list.append("\n".join(lines) + "\n")

    if not publications_list:
        return "No publication information found in this UniProt entry."
    return "\n---\n".join(publications_list)
def _first_property_value(xref, *keys):
    """Return the first non-empty property value of *xref* whose key is in *keys*."""
    for prop in xref.get("properties") or []:
        if prop.get("key") in keys and prop.get("value"):
            return prop["value"]
    return None


def extract_cross_references(uniprot_data):
    """Render links to selected external databases as Markdown.

    Args:
        uniprot_data: Parsed UniProtKB entry (a dict). Only the
            ``uniProtKBCrossReferences`` list is read.

    Returns:
        One line per database that has at least one link, in the order the
        databases are declared below, each formatted as
        ``**DB:** [text](url), ...`` with de-duplicated, sorted links; or a
        fixed message when nothing matched.
    """
    target_databases = {
        "Ensembl": "https://www.ensembl.org/id/",
        "GeneID": "https://www.ncbi.nlm.nih.gov/gene/",
        "RefSeq": "https://www.ncbi.nlm.nih.gov/nuccore/",
        "GO": "https://amigo.geneontology.org/amigo/term/",
        "InterPro": "https://www.ebi.ac.uk/interpro/entry/InterPro/",
        "Pfam": "https://www.ebi.ac.uk/interpro/entry/pfam/",
        "PDB": "https://www.rcsb.org/structure/",
        "KEGG": "https://www.genome.jp/dbget-bin/www_bget?",
        "Reactome": "https://reactome.org/content/detail/",
    }
    grouped_xrefs = {db: [] for db in target_databases}

    # `or []` also covers a key that is present but null.
    for xref in uniprot_data.get("uniProtKBCrossReferences") or []:
        db_name = xref.get("database")
        if db_name not in target_databases:
            continue
        base_url = target_databases[db_name]
        xref_id = xref.get("id")
        display_text = xref_id
        link_url = None

        if db_name == "Ensembl":
            # Prefer the gene, then the protein, then the xref's own id.
            display_text = (
                _first_property_value(xref, "GeneId")
                or _first_property_value(xref, "ProteinId")
                or xref_id
            )
            if display_text:
                # The /id/ endpoint is species-agnostic; the previous
                # Homo_sapiens URLs were wrong for every non-human entry.
                # Drop the ".N" version suffix so the link targets the
                # current record for that stable id.
                stable_id = display_text
                if stable_id.startswith("ENS"):
                    stable_id = stable_id.partition(".")[0]
                link_url = base_url + stable_id
        elif db_name == "RefSeq":
            # Fall back to the xref id rather than dropping the entry when no
            # usable property is present.
            display_text = (
                _first_property_value(xref, "ProteinId", "NucleotideSequenceId")
                or xref_id
            )
            if display_text:
                link_url = base_url + display_text
        elif db_name == "GO":
            if xref_id:
                term_name = _first_property_value(xref, "GoTerm") or xref_id
                display_text = f"{term_name} ({xref_id})"
                link_url = base_url + xref_id
        elif xref_id:
            link_url = base_url + xref_id

        if link_url:
            grouped_xrefs[db_name].append(f"[{display_text}]({link_url})")

    xref_list = [
        f"**{db_name}:** " + ", ".join(sorted(set(links)))
        for db_name, links in grouped_xrefs.items()
        if links
    ]
    if not xref_list:
        return "No cross-references found for the selected databases."
    return "\n".join(xref_list)
 # Fetch one UniProtKB entry as JSON and build the nine values returned to the
 # caller: overview Markdown, two image updates (amino-acid frequency plot and
 # sequence-feature plot), the feature-plot message, and Markdown for pathways,
 # interactions, disease, publications and cross-references.
 # On failure the first element carries the error text and the remaining eight
 # are hidden-plot / empty-string placeholders.
 # NOTE(review): this listing is a diff view and appears to omit some lines
 # (see the dangling `else:` in the molecular-weight section) - confirm
 # against the full app.py before relying on these comments.
 def get_protein_info(uniprot_id):
+    empty_plot = gr.update(value=None, visible=False); empty_str = ""; err_msg_default = "Error."
     # Placeholder tuple with the same arity (9) as the success return below.
+    outputs_on_error = (err_msg_default, empty_plot, empty_plot, empty_str,
+                        empty_str, empty_str, empty_str, empty_str, empty_str)
+    if not uniprot_id:
+        return ("Please enter a UniProt ID.",) + outputs_on_error[1:]
     # The accession is trimmed and upper-cased before it is substituted into the URL.
     url = UNIPROT_API_URL.format(accession=uniprot_id.strip().upper())
     try:
         # NOTE(review): no timeout is passed to requests.get - confirm an unbounded wait is acceptable.
         response = requests.get(url); response.raise_for_status(); data = response.json()
         acc = data.get("primaryAccession", "N/A"); id_display = data.get("uniProtkbId", "N/A")
+        uniprot_link = f"https://www.uniprot.org/uniprotkb/{acc}/entry"
+        name_dict = data.get("proteinDescription", {}).get("recommendedName", {}); name = name_dict.get("fullName", {}).get("value", "N/A")
         # No recommended name: fall back to the first submission name, if any.
         if name == "N/A" and data.get("proteinDescription", {}).get("submissionNames"): name = data["proteinDescription"]["submissionNames"][0].get("fullName", {}).get("value", "N/A")
         genes_data = data.get("genes"); gene_str = "N/A"
         if genes_data:
             g_list = [g.get("geneName", {}).get("value", "") for g in genes_data if g.get("geneName")]
             # ORF names are consulted only when no gene entry has a geneName.
             if not g_list: g_list = [g.get("orfNames", [{}])[0].get("value", "") for g in genes_data if g.get("orfNames")]
             gene_str = ", ".join(filter(None, g_list)) or "N/A"
+        org_data = data.get("organism", {}); org_sci_name = org_data.get("scientificName", "N/A")
+        org_common_name = org_data.get("commonName", "")
+        org_display = f"{org_sci_name}" + (f" ({org_common_name})" if org_common_name else "")
+        seq_info = data.get("sequence", {}); seq_val = seq_info.get("value", "N/A"); length = seq_info.get("length", 0)
         # Shorten the display strings by removing fixed prefixes / infixes.
+        status_val = data.get("entryAudit", {}).get("entryType", "N/A").replace("UniProtKB ", "")
+        existence_val = data.get("proteinExistence", "N/A").replace(": Evidence at ", ": ")
+        score_val = data.get("annotationScore", "N/A")
+        mw_str = "N/A"
         # NOTE(review): the `else:` below has no matching `if` in this view and no
         # visible line assigns a computed weight to mw_str; the molecular-weight
         # calculation is presumably among the lines missing from this listing.
         # The bare `except:` also hides every error type - worth narrowing.
         if seq_val != "N/A" and length > 0:
             try:
                 clean_seq = "".join(filter(lambda x: x in AMINO_ACID_NAMES, seq_val.upper()))
                 else: mw_str = "Invalid sequence for MW"
             except: mw_str = "Error in MW calc"
         comments_data = data.get("comments", []); func_comment = "N/A"
         # Use the first text of the first FUNCTION comment that has any texts.
+        for c_item in comments_data:
+            if c_item.get("commentType") == "FUNCTION":
+                texts = c_item.get("texts", [])
+                if texts: func_comment = texts[0].get("value", "N/A"); break
+        overview_md = (f"## {id_display} ({acc})\n"
+                       f"[{acc} on UniProt]({uniprot_link})\n\n"
+                       f"**Protein:** {name}\n**Gene:** {gene_str}\n**Status:** {status_val}\n"
+                       f"**Organism:** {org_display}\n**Length:** {length} aa\n"
+                       f"**Existence:** {existence_val}\n**Score:** {score_val}/5\n**Calc. MW:** {mw_str}\n\n"
+                       f"**Function Snippet:**\n{func_comment}\n\n"
+                       f"**Sequence (first 100 aa):**\n`{seq_val[:100]}{'...' if len(seq_val) > 100 else ''}`\n\n"
+                       f"--- \n*More details in other tabs.*")
         # Per-tab Markdown, each derived from the same entry dict.
+        interactions_md = extract_interactions(data)
+        pathways_md = extract_pathways(data)
+        disease_md = extract_disease_info(data)
+        publications_md = extract_publications(data)
+        xref_md = extract_cross_references(data)
+        aa_freq, aa_err = get_amino_acid_frequencies(seq_val)
+        aa_plot_upd = empty_plot
         # A frequency error is appended to the overview text; the plot stays hidden.
         if aa_err: overview_md += f"\n\n**AA Freq Error:** {aa_err}"
+        elif aa_freq:
+            img_aa = plot_amino_acid_frequencies(aa_freq)
             if img_aa: aa_plot_upd = gr.update(value=img_aa, visible=True)
+        seq_feat = extract_sequence_features(data)
+        feat_plot_upd = empty_plot; feat_msg = ""
         if seq_feat and length > 0:
             img_feat = plot_sequence_features(length, seq_feat)
             if img_feat: feat_plot_upd = gr.update(value=img_feat, visible=True)
+            else: feat_msg = "Could not generate sequence feature plot."
         elif not seq_feat and length > 0 : feat_msg = "No relevant features found for plotting."
+        return (overview_md, aa_plot_upd, feat_plot_upd, feat_msg,
+                pathways_md, interactions_md, disease_md, publications_md, xref_md)
     except requests.exceptions.HTTPError as e:
         # A 404 gets a dedicated "not found" message; other HTTP errors are shown as-is.
         err_msg_http = f"Error: ID '{uniprot_id}' not found." if e.response.status_code == 404 else f"HTTP error: {e}"
         return (err_msg_http,) + outputs_on_error[1:]
     # Any other failure is reported with its message truncated to 150 characters.
+    except Exception as e:
+        return (f"Error: {str(e)[:150]}",) + outputs_on_error[1:]
 # Gradio UI definition: one accession textbox plus a submit button, and five
 # tabs whose components receive the nine values returned by get_protein_info.
 with gr.Blocks(theme=gr.themes.Glass()) as iface:
+    gr.Markdown("# Protein Profile Viewer (v1.4 - Cross-references)")
+    gr.Markdown("Enter a UniProt ID to explore its details including overview, sequence analysis, functional context, publications, and links to other databases.")
     # Input row: textbox and button are given scale 3 and scale 1 respectively.
     with gr.Row():
+        protein_id_input = gr.Textbox(label="Enter UniProt ID", placeholder="e.g., P00533", scale=3)
+        submit_button = gr.Button("Submit", scale=1, variant="primary")
     with gr.Tabs():
         with gr.TabItem("Overview"):
+            gr.Markdown("### Protein Overview\nKey information about the protein, including its UniProt ID, name, gene, organism, length, function snippet, and a link to the full UniProt entry. The first 100 amino acids of the sequence are also displayed here.")
+            overview_output = gr.Markdown()
         with gr.TabItem("Analysis Plots"):
+            gr.Markdown("### Sequence Analysis Visualizations\nGraphical representations of amino acid composition and annotated sequence features.")
             # Both images are created with visible=False; get_protein_info
             # returns gr.update(..., visible=True) when a plot was produced.
+            with gr.Column():
+                aa_freq_plot_output = gr.Image(label="Amino Acid Frequency Plot", type="pil", show_label=True, visible=False)
+                seq_features_plot_output = gr.Image(label="Sequence Features Plot", type="pil", show_label=True, visible=False)
+                seq_features_message_output = gr.Markdown()
         with gr.TabItem("Functional Context"):
+            gr.Markdown("### Pathways, Interactions & Disease\nBiological context: pathways, interaction partners, and associated diseases.")
             # Three accordions, each created with open=False.
+            with gr.Accordion("Biological Pathways (KEGG, Reactome)", open=False):
+                 pathways_output = gr.Markdown()
+            with gr.Accordion("Protein Interactions", open=False):
+                 interactions_output = gr.Markdown()
+            with gr.Accordion("Disease Associations", open=False):
+                 disease_output = gr.Markdown()
         with gr.TabItem("Publications"):
+            gr.Markdown("### Relevant Publications\nA list of scientific publications from UniProt.")
+            publications_output = gr.Markdown()
+        with gr.TabItem("Cross-references"): # New Tab
+             gr.Markdown("### Database Links\nLinks to this protein's entry in other relevant biological databases (e.g., Ensembl, RefSeq, GO, PDB).")
+             xref_output = gr.Markdown() # New output component
     # The outputs list order must match the tuple order returned by
     # get_protein_info (overview, AA plot, feature plot, feature message,
     # pathways, interactions, disease, publications, cross-references).
     submit_button.click(
+        fn=get_protein_info,
+        inputs=protein_id_input,
         outputs=[overview_output, aa_freq_plot_output, seq_features_plot_output, seq_features_message_output,
                  pathways_output, interactions_output, disease_output, publications_output, xref_output]
     )
     # Example accessions offered for the input textbox.
+    gr.Examples(
+        examples=[["P05067"], ["P00533"], ["Q9BYF1"], ["P0DP23"], ["P04637"]],
+        inputs=protein_id_input
+    )
 # Launch the app only when this file is run as a script.
 if __name__ == "__main__":
     iface.launch()