Kaveh committed on
Commit
1c7472d
·
unverified ·
1 Parent(s): 2b486f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +281 -241
app.py CHANGED
@@ -8,10 +8,8 @@ from PIL import Image
8
  import sys
9
  import traceback
10
  import json
11
- import pandas as pd # Keep import, might be useful later
12
 
13
  UNIPROT_API_URL = "https://rest.uniprot.org/uniprotkb/{accession}.json"
14
- UNIPROT_SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search"
15
 
16
  AMINO_ACID_NAMES = {
17
  'A': 'Alanine', 'R': 'Arginine', 'N': 'Asparagine', 'D': 'Aspartic acid',
@@ -22,191 +20,256 @@ AMINO_ACID_NAMES = {
22
  }
23
  STANDARD_AMINO_ACIDS_ORDER = "ARNDCQEGHILKMFPSTWYV"
24
 
25
- # --- Helper Functions (Keep as is) ---
26
- def get_amino_acid_frequencies(sequence): # ... (same as before) ...
27
  if not sequence or sequence == "N/A": return None, "Sequence not available for analysis."
28
  cleaned_sequence = "".join(filter(lambda x: x in AMINO_ACID_NAMES, sequence.upper()))
29
- if not cleaned_sequence: return None, "No valid amino acids found for counting."
30
- counts = Counter(cleaned_sequence); frequencies = {aa: counts.get(aa, 0) for aa in STANDARD_AMINO_ACIDS_ORDER}
 
31
  return frequencies, None
32
- def plot_amino_acid_frequencies(frequencies): # ... (same as before) ...
 
33
  if not frequencies: return None
34
  ordered_keys = [key for key in STANDARD_AMINO_ACIDS_ORDER if key in frequencies]
35
- labels = [f"{aa}: {AMINO_ACID_NAMES.get(aa, aa)}" for aa in ordered_keys]; values = [frequencies[aa] for aa in ordered_keys]
 
36
  fig, ax = plt.subplots(figsize=(12, 7)); ax.bar(labels, values, color='skyblue')
37
- ax.set_xlabel("Amino Acid"); ax.set_ylabel("Frequency"); ax.set_title("AA Freq Plot")
38
  plt.xticks(rotation=75, ha="right", fontsize=8); plt.tight_layout()
39
- buf = io.BytesIO(); plt.savefig(buf, format='png'); buf.seek(0); img = Image.open(buf); plt.close(fig); return img
40
- def extract_sequence_features(uniprot_data): # ... (same as before) ...
41
- features_of_interest_uppercase = {"DOMAIN": "blue", "MOTIF": "green", "ACTIVE_SITE": "red", "BINDING_SITE": "orange", "MOD_RES": "purple", "HELIX": "cyan", "STRAND": "magenta", "TURN": "gold"}
42
- extracted = []
43
- if "features" in uniprot_data and uniprot_data["features"]:
44
- for item in uniprot_data["features"]:
45
- type_raw = item.get("type"); loc_obj = item.get("location", {})
46
- if not isinstance(type_raw, str): continue
47
- type_norm = type_raw.strip().upper()
48
- if type_norm in features_of_interest_uppercase:
 
 
 
 
 
 
 
49
  try:
50
- b_str, e_str = None, None; s_node, e_node, p_node = loc_obj.get("start"), loc_obj.get("end"), loc_obj.get("position")
51
- if s_node and isinstance(s_node, dict) and "value" in s_node: b_str = str(s_node["value"])
52
- if e_node and isinstance(e_node, dict) and "value" in e_node: e_str = str(e_node["value"])
53
- if p_node and isinstance(p_node, dict) and "value" in p_node:
54
- p_str = str(p_node["value"]);
55
- if b_str is None: b_str = p_str
56
- if e_str is None: e_str = p_str
57
- if "start" not in loc_obj and "end" not in loc_obj: b_str, e_str = p_str, p_str
58
- if b_str is None or e_str is None: continue
59
- b_pos, e_pos = int(b_str), int(e_str)
60
- if b_pos > e_pos: continue
61
- extracted.append({"type": type_raw, "begin": b_pos, "end": e_pos, "description": item.get("description", type_raw), "color": features_of_interest_uppercase[type_norm]})
62
- except: continue
63
- return extracted
64
- def plot_sequence_features(sequence_length, features): # ... (same as before) ...
 
 
 
 
 
 
 
 
 
65
  if not features or sequence_length == 0: return None
66
  fig, ax = plt.subplots(figsize=(12, max(3, len(features) * 0.4) + 1.5))
67
- ax.set_xlim(0, sequence_length); ax.set_xlabel("AA Position"); ax.set_yticks([])
68
- ax.set_title("Sequence Features"); leg_h = {}
69
- y_pos, leg_set, b_h = 0, set(), 0.8
70
- for feat in sorted(features, key=lambda x: x["begin"]):
71
- b, e, c = feat["begin"], feat["end"], feat["color"]; w = max(1, e - b + 1)
72
- ax.barh(y_pos, w, height=b_h, left=b -1, color=c, edgecolor='black', alpha=0.7)
73
- if feat['type'] not in leg_set: leg_h[feat['type']] = plt.Rectangle((0, 0), 1, 1, fc=c, alpha=0.7); leg_set.add(feat['type'])
74
- y_pos += 1
75
- if y_pos > 0: ax.set_ylim(-0.5, y_pos -1 + b_h/2 + 0.5)
 
 
 
76
  else: plt.close(fig); return None
77
- if leg_h: ax.legend(leg_h.values(), leg_h.keys(), title="Feature Types", bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=8)
78
  plt.tight_layout(rect=[0, 0, 0.83, 0.96])
79
- buf = io.BytesIO(); plt.savefig(buf, format='png'); buf.seek(0); img = Image.open(buf); plt.close(fig); return img
80
- def extract_interactions(uniprot_data): # ... (same as before) ...
81
- interactions = []
 
 
 
82
  if "comments" in uniprot_data:
83
- for c in uniprot_data["comments"]:
84
- if c.get("commentType") == "INTERACTION" and "interactions" in c:
85
- for i_entry in c["interactions"]:
86
- i1_acc = i_entry.get("interactantOne", {}).get("uniProtKBAccession")
87
- i2_acc = i_entry.get("interactantTwo", {}).get("uniProtKBAccession")
88
- i2_gene = i_entry.get("interactantTwo", {}).get("geneName")
89
- if i1_acc == uniprot_data.get("primaryAccession") and i2_acc:
90
- partner = f"{i2_gene} ({i2_acc})" if i2_gene else i2_acc
91
- interactions.append(f"- Interacts with: **{partner}**")
92
- return "\n".join(sorted(interactions)) if interactions else "No interaction partners listed in comments."
93
- def extract_pathways(uniprot_data): # ... (same as before) ...
94
- pathways = []; dbs = { "KEGG": "...", "Reactome": "..." } # URLs omitted
 
 
 
 
 
 
 
95
  if "uniProtKBCrossReferences" in uniprot_data:
96
  for xref in uniprot_data["uniProtKBCrossReferences"]:
97
- db = xref.get("database"); pid = xref.get("id")
98
- if db in dbs and pid:
99
- desc = pid;
100
  if "properties" in xref:
101
- for p in xref["properties"]:
102
- if p.get("key") in ["PathwayName", "Description"]: desc = f"{p.get('value')} ({pid})"; break
103
- link = dbs[db] + pid; pathways.append(f"- [{desc}]({link}) ({db})")
104
- return "\n".join(sorted(list(set(pathways)))) if pathways else "No KEGG/Reactome pathway info."
105
- def extract_disease_info(uniprot_data): # ... (same as before) ...
106
- diseases = []
 
 
 
 
 
 
107
  if "comments" in uniprot_data:
108
- for c in uniprot_data["comments"]:
109
- if c.get("commentType") == "DISEASE" and "disease" in c:
110
- d_entry = c["disease"]; d_name = d_entry.get("diseaseId", "?"); desc = d_entry.get("description", "N/A")
111
- mim = None;
112
- if "diseaseCrossReference" in d_entry and d_entry["diseaseCrossReference"].get("database") == "MIM": mim = d_entry["diseaseCrossReference"].get("id")
113
- d_md = f"**{d_name}**" + (f" (MIM: [{mim}](https://omim.org/entry/{mim}))" if mim else "")
114
- d_md += f"\n - *Desc:* {desc}\n"; note_val = c.get("note", {}).get("texts", [{}])[0].get("value")
115
- if note_val: d_md += f" - *Note:* {note_val}\n"
116
- diseases.append(d_md)
117
- return "\n---\n".join(sorted(diseases)) if diseases else "No disease association info."
118
- def extract_publications(uniprot_data): # ... (same as before) ...
119
- pubs = []
120
- if "references" in uniprot_data:
121
- for i, ref in enumerate(uniprot_data.get("references", [])):
122
- cit = ref.get("citation", {}); title = cit.get("title", "N/A"); authors = ", ".join(cit.get("authors", ["N/A"]))
123
- j = cit.get("journalName", ""); v = cit.get("volume", ""); f = cit.get("firstPage", ""); l = cit.get("lastPage", ""); d = cit.get("publicationDate", "")
124
- pmid, doi = None, None
125
- if "citationCrossReferences" in cit:
126
- for xr in cit["citationCrossReferences"]:
127
- if xr.get("database") == "PubMed": pmid = xr.get("id")
128
- elif xr.get("database") == "DOI": doi = xr.get("id")
129
- md = f"**{i + 1}. {title}**\n - *{authors}*\n - *{j}" + (f", {v}" if v else "") + (f":{f}" if f else "") + (f"-{l}" if l else "")
130
- if d: md += f" ({d})"
131
- md += "*\n"
132
- if pmid: md += f" - [PubMed {pmid}](https://pubmed.ncbi.nlm.nih.gov/{pmid}/)\n"
133
- if doi: md += f" - [DOI {doi}](https://doi.org/{doi})\n"
134
- pubs.append(md)
135
- return "\n---\n".join(pubs) if pubs else "No publication info found."
136
- def extract_cross_references(uniprot_data): # ... (same as before) ...
137
- xrefs = []; dbs = {"Ensembl": "...", "GeneID": "...", "RefSeq": "...", "GO": "...", "InterPro": "...", "Pfam": "...", "PDB": "...", "KEGG": "...", "Reactome": "..."} # URLs omitted
138
- grouped = {db: [] for db in dbs};
139
- if "uniProtKBCrossReferences" in uniprot_data:
140
- for xr in uniprot_data["uniProtKBCrossReferences"]:
141
- db = xr.get("database"); xid = xr.get("id")
142
- if db in dbs and xid:
143
- url, txt = None, xid;
144
- if db == "GO": txt = xr.get("properties", [{}])[0].get("value", xid) + f" ({xid})"
145
- url = dbs[db] + xid
146
- if url: link_md = f"[{txt}]({url})";
147
- if link_md not in grouped[db]: grouped[db].append(link_md)
148
- for db, links in grouped.items():
149
- if links: xrefs.append(f"**{db}:** " + ", ".join(sorted(list(set(links)))))
150
- return "\n".join(xrefs) if xrefs else "No selected cross-references."
151
 
152
- # --- Search Function ---
153
- def search_uniprot_by_name(search_term, result_limit=5):
154
- # ... (Implementation from previous version, returns status_md, dataset_data) ...
155
- if not search_term or len(search_term) < 3: return "Enter at least 3 characters.", []
156
- params = { "query": f'({search_term}) AND (reviewed:true)', "fields": "accession,id,protein_name,organism_name", "format": "json", "size": result_limit }
157
- status_md = f"### Search Results for '{search_term}':\n"; dataset_data = []
158
- try:
159
- response = requests.get(UNIPROT_SEARCH_URL, params=params); response.raise_for_status(); data = response.json()
160
- results = data.get("results")
161
- if not results: status_md += "No reviewed entries found."
162
- else:
163
- status_md += f"*Found {len(results)}. Select a row below, then copy the ID.*\n---"
164
- for entry in results:
165
- acc = entry.get("primaryAccession", "N/A"); uid = entry.get("uniProtkbId", ""); name = "N/A"
166
- if entry.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", {}).get("value"): name = entry["proteinDescription"]["recommendedName"]["fullName"]["value"]
167
- elif entry.get("proteinDescription", {}).get("submissionNames"): name = entry["proteinDescription"]["submissionNames"][0].get("fullName",{}).get("value", "N/A")
168
- org = entry.get("organism", {}).get("scientificName", "N/A"); dataset_data.append([acc, uid, name, org])
169
- status_md += "\n---"
170
- except Exception as e: status_md += f"\n**Search Error:** {e}"
171
- return status_md, dataset_data
 
 
 
 
 
 
 
 
 
 
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
- # --- Function to update copy box (Corrected) ---
175
- def update_copy_box(results_data, evt: gr.SelectData):
176
- """Updates the textbox with the Accession ID from the selected row."""
177
- if evt.index is None or results_data is None:
178
- return "" # No selection or no data
179
-
180
- row_index = evt.index[0] # Get the row index of the selection
181
-
182
- # Ensure the row index is valid for the current dataset results
183
- if 0 <= row_index < len(results_data):
184
- selected_row = results_data[row_index]
185
- accession_id = selected_row[0] # Accession ID is the first element
186
- return accession_id
187
- return "" # Return empty if index is out of bounds
188
 
189
- # --- Main Function to Get All Info ---
190
  def get_protein_info(uniprot_id):
191
- # ... (Implementation from v1.5.1, returns 9 outputs) ...
192
- empty_plot = gr.update(value=None, visible=False); empty_str = ""; err_msg = "Error"
193
- outputs_on_error = (err_msg, empty_plot, empty_plot, empty_str, empty_str, empty_str, empty_str, empty_str, empty_str)
194
- if not uniprot_id: return ("Enter UniProt ID.",) + outputs_on_error[1:]
 
 
195
  url = UNIPROT_API_URL.format(accession=uniprot_id.strip().upper())
196
  try:
197
  response = requests.get(url); response.raise_for_status(); data = response.json()
 
198
  acc = data.get("primaryAccession", "N/A"); id_display = data.get("uniProtkbId", "N/A")
199
- link = f"https://www.uniprot.org/uniprotkb/{acc}/entry"; name_dict = data.get("proteinDescription", {}).get("recommendedName", {}); name = name_dict.get("fullName", {}).get("value", "N/A")
 
200
  if name == "N/A" and data.get("proteinDescription", {}).get("submissionNames"): name = data["proteinDescription"]["submissionNames"][0].get("fullName", {}).get("value", "N/A")
201
  genes_data = data.get("genes"); gene_str = "N/A"
202
  if genes_data:
203
  g_list = [g.get("geneName", {}).get("value", "") for g in genes_data if g.get("geneName")]
204
  if not g_list: g_list = [g.get("orfNames", [{}])[0].get("value", "") for g in genes_data if g.get("orfNames")]
205
  gene_str = ", ".join(filter(None, g_list)) or "N/A"
206
- org_data = data.get("organism", {}); org_sci_name = org_data.get("scientificName", "N/A"); org_common_name = org_data.get("commonName", "")
207
- org_display = f"{org_sci_name}" + (f" ({org_common_name})" if org_common_name else ""); seq_info = data.get("sequence", {}); seq_val = seq_info.get("value", "N/A"); length = seq_info.get("length", 0)
208
- status_val = data.get("entryAudit", {}).get("entryType", "N/A").replace("UniProtKB ", ""); existence_val = data.get("proteinExistence", "N/A").replace(": Evidence at ", ": ")
209
- score_val = data.get("annotationScore", "N/A"); mw_str = "N/A"
 
 
 
 
210
  if seq_val != "N/A" and length > 0:
211
  try:
212
  clean_seq = "".join(filter(lambda x: x in AMINO_ACID_NAMES, seq_val.upper()))
@@ -214,114 +277,91 @@ def get_protein_info(uniprot_id):
214
  else: mw_str = "Invalid sequence for MW"
215
  except: mw_str = "Error in MW calc"
216
  comments_data = data.get("comments", []); func_comment = "N/A"
217
- for c_item in comments_data:
218
- if c_item.get("commentType") == "FUNCTION":
219
- texts = c_item.get("texts", [])
220
- if texts:
221
- func_comment = texts[0].get("value", "N/A")
222
- break
223
- overview_md = (f"## {id_display} ({acc})\n[{acc} on UniProt]({link})\n\n**Protein:** {name}\n**Gene:** {gene_str}\n**Status:** {status_val}\n"
224
- f"**Organism:** {org_display}\n**Length:** {length} aa\n**Existence:** {existence_val}\n**Score:** {score_val}/5\n**Calc. MW:** {mw_str}\n\n"
225
- f"**Function Snippet:**\n{func_comment}\n\n**Sequence (first 100 aa):**\n`{seq_val[:100]}{'...' if len(seq_val) > 100 else ''}`\n\n--- \n*More details in other tabs.*")
226
- interactions_md = extract_interactions(data); pathways_md = extract_pathways(data)
227
- disease_md = extract_disease_info(data); publications_md = extract_publications(data); xref_md = extract_cross_references(data)
228
- aa_freq, aa_err = get_amino_acid_frequencies(seq_val); aa_plot_upd = empty_plot
 
 
 
 
 
 
 
 
 
 
229
  if aa_err: overview_md += f"\n\n**AA Freq Error:** {aa_err}"
230
- elif aa_freq: img_aa = plot_amino_acid_frequencies(aa_freq);
 
231
  if img_aa: aa_plot_upd = gr.update(value=img_aa, visible=True)
232
- seq_feat = extract_sequence_features(data); feat_plot_upd = empty_plot; feat_msg = ""
 
 
233
  if seq_feat and length > 0:
234
  img_feat = plot_sequence_features(length, seq_feat)
235
  if img_feat: feat_plot_upd = gr.update(value=img_feat, visible=True)
236
- else: feat_msg = "Could not generate feature plot."
237
  elif not seq_feat and length > 0 : feat_msg = "No relevant features found for plotting."
238
- return (overview_md, aa_plot_upd, feat_plot_upd, feat_msg, pathways_md, interactions_md, disease_md, publications_md, xref_md)
 
 
 
239
  except requests.exceptions.HTTPError as e:
240
  err_msg_http = f"Error: ID '{uniprot_id}' not found." if e.response.status_code == 404 else f"HTTP error: {e}"
241
  return (err_msg_http,) + outputs_on_error[1:]
242
- except Exception as e: return (f"Error: {str(e)[:150]}",) + outputs_on_error[1:]
 
243
 
244
- # --- Gradio UI Definition ---
245
  with gr.Blocks(theme=gr.themes.Glass()) as iface:
246
- gr.Markdown("# Protein Profile Viewer (v1.6.2 - Select Fix)") # Version updated
247
- gr.Markdown("Enter a UniProt ID directly, **OR** search for a protein/gene name below to find and copy its ID.")
248
-
249
- with gr.Group():
250
- gr.Markdown("### Find UniProt ID by Name/Keyword")
251
- search_term_input = gr.Textbox(label="Search Term (min 3 chars, type and wait)", placeholder="e.g., insulin, EGFR, P53")
252
- search_status_output = gr.Markdown()
253
- # Dataset to store search results data (invisible, used as state)
254
- search_results_data_state = gr.State([])
255
- # Dataset component for display
256
- search_results_output_display = gr.Dataset(
257
- label="Search Results (Select a row to copy Accession)",
258
- headers=["Accession", "UniProtKB ID", "Protein Name", "Organism"],
259
- samples=[], samples_per_page=5
260
- )
261
- selected_id_to_copy = gr.Textbox(label="Copy this Accession ID:", interactive=True, show_copy_button=True)
262
-
263
- gr.Markdown("---")
264
-
265
- gr.Markdown("### View Protein Profile")
266
  with gr.Row():
267
- protein_id_input = gr.Textbox(label="Enter UniProt Accession ID", placeholder="Paste ID here or enter directly", scale=3)
268
- submit_button = gr.Button("Get Profile", scale=1, variant="primary")
269
-
270
  with gr.Tabs():
271
- # ... (Tabs definition remains the same) ...
272
  with gr.TabItem("Overview"):
273
- gr.Markdown("### Protein Overview\nKey information...")
274
- overview_output = gr.Markdown()
275
  with gr.TabItem("Analysis Plots"):
276
- gr.Markdown("### Sequence Analysis Visualizations\nAmino acid composition and annotated features.")
277
- with gr.Column():
278
- aa_freq_plot_output = gr.Image(label="Amino Acid Frequency Plot", type="pil", show_label=True, visible=False)
279
- seq_features_plot_output = gr.Image(label="Sequence Features Plot", type="pil", show_label=True, visible=False)
280
- seq_features_message_output = gr.Markdown()
281
  with gr.TabItem("Functional Context"):
282
- gr.Markdown("### Pathways, Interactions & Disease\nBiological context.")
283
- with gr.Accordion("Biological Pathways", open=False): pathways_output = gr.Markdown()
284
- with gr.Accordion("Protein Interactions", open=False): interactions_output = gr.Markdown()
285
- with gr.Accordion("Disease Associations", open=False): disease_output = gr.Markdown()
 
 
 
286
  with gr.TabItem("Publications"):
287
- gr.Markdown("### Relevant Publications\nAssociated scientific literature.")
288
- publications_output = gr.Markdown()
289
- with gr.TabItem("Cross-references"):
290
- gr.Markdown("### Database Links\nLinks to other relevant databases.")
291
- xref_output = gr.Markdown()
292
 
293
- # --- Event Handlers (Corrected) ---
294
- search_term_input.change(
295
- fn=search_uniprot_by_name,
296
- inputs=search_term_input,
297
- # Output both the status message and the data for the Dataset state
298
- outputs=[search_status_output, search_results_data_state]
299
- )
300
-
301
- # When the state data changes, update the displayed Dataset
302
- search_results_data_state.change(
303
- fn=lambda data: data, # Simple function to pass data through
304
- inputs=search_results_data_state,
305
- outputs=search_results_output_display
306
- )
307
-
308
- # When a row is selected in the displayed Dataset, update the copy box
309
- search_results_output_display.select(
310
- fn=update_copy_box,
311
- # Pass the *state* containing the data and the event data
312
- inputs=[search_results_data_state],
313
- outputs=selected_id_to_copy
314
- # Removed the incorrect _js argument
315
- )
316
-
317
- # Main profile submit button action
318
  submit_button.click(
319
- fn=get_protein_info, inputs=protein_id_input,
 
320
  outputs=[overview_output, aa_freq_plot_output, seq_features_plot_output, seq_features_message_output,
321
  pathways_output, interactions_output, disease_output, publications_output, xref_output]
322
  )
323
-
324
- gr.Examples(examples=[["P05067"], ["P00533"], ["Q9BYF1"], ["P0DP23"], ["P04637"]], inputs=protein_id_input)
 
 
325
 
326
  if __name__ == "__main__":
327
  iface.launch()
 
8
  import sys
9
  import traceback
10
  import json
 
11
 
12
  UNIPROT_API_URL = "https://rest.uniprot.org/uniprotkb/{accession}.json"
 
13
 
14
  AMINO_ACID_NAMES = {
15
  'A': 'Alanine', 'R': 'Arginine', 'N': 'Asparagine', 'D': 'Aspartic acid',
 
20
  }
21
  STANDARD_AMINO_ACIDS_ORDER = "ARNDCQEGHILKMFPSTWYV"
22
 
23
def get_amino_acid_frequencies(sequence):
    """Count how often each standard amino acid occurs in ``sequence``.

    Args:
        sequence: Protein sequence as one-letter codes (any case), or the
            placeholder ``"N/A"``.

    Returns:
        A ``(frequencies, error)`` pair. On success ``frequencies`` maps every
        letter of ``STANDARD_AMINO_ACIDS_ORDER`` to its count (zero when the
        residue is absent) and ``error`` is ``None``. On failure
        ``frequencies`` is ``None`` and ``error`` is a human-readable message.
    """
    if not sequence or sequence == "N/A":
        return None, "Sequence not available for analysis."

    # Drop every character that is not a known one-letter amino acid code.
    residues = [letter for letter in sequence.upper() if letter in AMINO_ACID_NAMES]
    if not residues:
        return None, "No valid amino acids found in sequence for counting."

    tally = Counter(residues)
    return {aa: tally[aa] for aa in STANDARD_AMINO_ACIDS_ORDER}, None
30
+
31
def plot_amino_acid_frequencies(frequencies):
    """Draw a bar chart of amino acid counts and return it as a PIL image.

    Args:
        frequencies: Mapping of one-letter amino acid code to count.

    Returns:
        The chart rendered as a PNG and opened with ``Image.open``, or
        ``None`` when ``frequencies`` is empty or ``None``.
    """
    if not frequencies:
        return None

    # Keep the bars in the canonical residue order, skipping missing keys.
    labels = []
    values = []
    for aa in STANDARD_AMINO_ACIDS_ORDER:
        if aa not in frequencies:
            continue
        labels.append(f"{aa}: {AMINO_ACID_NAMES.get(aa, aa)}")
        values.append(frequencies[aa])

    fig, ax = plt.subplots(figsize=(12, 7))
    ax.bar(labels, values, color='skyblue')
    ax.set_xlabel("Amino Acid")
    ax.set_ylabel("Frequency")
    ax.set_title("Amino Acid Frequency Plot")
    plt.xticks(rotation=75, ha="right", fontsize=8)
    plt.tight_layout()

    # Render into memory, then release the figure.
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    plt.close(fig)
    return img
42
+
43
def extract_sequence_features(uniprot_data):
    """Collect the plottable sequence features of a UniProt entry.

    Feature types are matched case-insensitively. Both the legacy short codes
    (``ACTIVE_SITE``, ``MOD_RES``, ``STRAND``, ...) and the spelled-out names
    ("Active site", "Modified residue", "Beta strand", ...) are accepted, so
    entries using either vocabulary yield features.

    Args:
        uniprot_data: Parsed UniProt entry (a dict). Its optional
            ``"features"`` list is scanned; a missing or ``None`` value yields
            an empty result.

    Returns:
        A list of dicts with keys ``type`` (the type string exactly as found
        in the entry), ``begin`` and ``end`` (ints), ``description`` (falls
        back to the type when absent) and ``color``. Features of other types,
        without usable coordinates, or with ``begin > end`` are skipped.
    """
    feature_colors = {
        "DOMAIN": "blue", "MOTIF": "green", "ACTIVE_SITE": "red",
        "BINDING_SITE": "orange", "MOD_RES": "purple",
        "HELIX": "cyan", "STRAND": "magenta", "TURN": "gold",
    }
    # Spelled-out type names mapped onto the short codes used as colour keys.
    type_aliases = {
        "ACTIVE SITE": "ACTIVE_SITE",
        "BINDING SITE": "BINDING_SITE",
        "MODIFIED RESIDUE": "MOD_RES",
        "BETA STRAND": "STRAND",
    }

    def _coordinate(node):
        """Return ``node["value"]`` as an int, or ``None`` when unavailable."""
        if isinstance(node, dict) and node.get("value") is not None:
            return int(str(node["value"]))
        return None

    extracted_features = []
    for feature_item in uniprot_data.get("features") or []:
        if not isinstance(feature_item, dict):
            continue
        feature_type_raw = feature_item.get("type")
        if not isinstance(feature_type_raw, str):
            continue
        normalized = feature_type_raw.strip().upper()
        canonical = type_aliases.get(normalized, normalized)
        if canonical not in feature_colors:
            continue

        location = feature_item.get("location")
        if not isinstance(location, dict):
            continue
        try:
            begin_pos = _coordinate(location.get("start"))
            end_pos = _coordinate(location.get("end"))
            single_pos = _coordinate(location.get("position"))
        except (ValueError, TypeError):
            # Non-numeric coordinate: nothing sensible to plot.
            continue

        # A single "position" stands in for whichever boundary is missing.
        if begin_pos is None:
            begin_pos = single_pos
        if end_pos is None:
            end_pos = single_pos
        if begin_pos is None or end_pos is None or begin_pos > end_pos:
            continue

        extracted_features.append({
            "type": feature_type_raw,
            "begin": begin_pos,
            "end": end_pos,
            "description": feature_item.get("description", feature_type_raw),
            "color": feature_colors[canonical],
        })
    return extracted_features
79
+
80
def plot_sequence_features(sequence_length, features):
    """Draw each feature as a horizontal bar along the sequence axis.

    Args:
        sequence_length: Length of the protein; used as the x-axis upper limit.
        features: Feature dicts with ``begin``, ``end``, ``color`` and ``type``
            keys.

    Returns:
        The chart rendered as a PNG and opened with ``Image.open``, or
        ``None`` when there are no features or the length is zero.
    """
    if not features or sequence_length == 0:
        return None

    bar_height = 0.8
    fig, ax = plt.subplots(figsize=(12, max(3, len(features) * 0.4) + 1.5))
    ax.set_xlim(0, sequence_length)
    ax.set_xlabel("Amino Acid Position")
    ax.set_yticks([])
    ax.set_title("Sequence Features Plot")

    # One row per feature, ordered by start position; one legend entry per type.
    legend_handles = {}
    ordered = sorted(features, key=lambda item: item["begin"])
    for row, feature in enumerate(ordered):
        start = feature["begin"]
        stop = feature["end"]
        shade = feature["color"]
        span = max(1, stop - start + 1)
        # Positions are 1-based, so the bar's left edge sits at start - 1.
        ax.barh(row, span, height=bar_height, left=start - 1, color=shade, edgecolor='black', alpha=0.7)
        if feature['type'] not in legend_handles:
            legend_handles[feature['type']] = plt.Rectangle((0, 0), 1, 1, fc=shade, alpha=0.7)

    row_count = len(ordered)
    if row_count > 0:
        ax.set_ylim(-0.5, row_count - 1 + bar_height/2 + 0.5)
    else:
        plt.close(fig)
        return None

    if legend_handles:
        ax.legend(legend_handles.values(), legend_handles.keys(), title="Feature Types",
                  bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=8)
    # Leave room on the right for the legend placed outside the axes.
    plt.tight_layout(rect=[0, 0, 0.83, 0.96])

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    plt.close(fig)
    return img
101
+
102
def extract_interactions(uniprot_data):
    """Format the entry's interaction partners as a Markdown bullet list.

    Only interactions whose first interactant is the entry's own
    ``primaryAccession`` are reported. Each partner is listed once, even if
    several interaction records mention it.

    Args:
        uniprot_data: Parsed UniProt entry (a dict). ``"comments"`` may be
            missing or ``None``.

    Returns:
        Alphabetically sorted Markdown lines joined by newlines, or a fixed
        message when no partner is found.
    """
    own_accession = uniprot_data.get("primaryAccession")
    lines = set()  # a set so repeated partners collapse into one bullet
    for comment in uniprot_data.get("comments") or []:
        if comment.get("commentType") != "INTERACTION":
            continue
        for interaction_entry in comment.get("interactions") or []:
            # Either interactant may be absent or null; treat both as empty.
            first = interaction_entry.get("interactantOne") or {}
            second = interaction_entry.get("interactantTwo") or {}
            partner_acc = second.get("uniProtKBAccession")
            if first.get("uniProtKBAccession") != own_accession or not partner_acc:
                continue
            partner_gene = second.get("geneName")
            partner = f"{partner_gene} ({partner_acc})" if partner_gene else partner_acc
            lines.add(f"- Interacts with: **{partner}**")

    if not lines:
        return "No specific interaction partners listed in UniProt comments."
    return "\n".join(sorted(lines))
120
+
121
def extract_pathways(uniprot_data):
    """Format KEGG and Reactome cross-references as Markdown links.

    Args:
        uniprot_data: Parsed UniProt entry (a dict).
            ``"uniProtKBCrossReferences"`` and each reference's
            ``"properties"`` may be missing or ``None``.

    Returns:
        Sorted, de-duplicated Markdown bullet lines joined by newlines, or a
        fixed message when the entry has no KEGG/Reactome reference.
    """
    pathway_databases = {
        "KEGG": "https://www.genome.jp/dbget-bin/www_bget?",
        "Reactome": "https://reactome.org/content/detail/",
    }
    pathways = set()
    for xref in uniprot_data.get("uniProtKBCrossReferences") or []:
        db_name = xref.get("database")
        pathway_id = xref.get("id")
        if db_name not in pathway_databases or not pathway_id:
            continue
        # The first name/description property, if any, labels the link.
        pathway_description = next(
            (prop.get("value") for prop in xref.get("properties") or []
             if prop.get("key") in ("PathwayName", "Description")),
            None,
        )
        link = pathway_databases[db_name] + pathway_id
        display_text = f"{pathway_description} ({pathway_id})" if pathway_description else pathway_id
        pathways.add(f"- [{display_text}]({link}) ({db_name})")

    if not pathways:
        return "No pathway information found in KEGG or Reactome cross-references."
    return "\n".join(sorted(pathways))
139
+
140
def extract_disease_info(uniprot_data):
    """Format the entry's disease annotations as Markdown.

    Args:
        uniprot_data: Parsed UniProt entry (a dict). ``"comments"`` and, per
            comment, ``"disease"``, ``"diseaseCrossReference"`` and ``"note"``
            may each be missing or ``None``.

    Returns:
        One Markdown section per DISEASE comment, sorted alphabetically and
        separated by horizontal rules, or a fixed message when none is found.
    """
    disease_info_list = []
    for comment in uniprot_data.get("comments") or []:
        disease_entry = comment.get("disease")
        if comment.get("commentType") != "DISEASE" or not isinstance(disease_entry, dict):
            continue

        disease_name = disease_entry.get("diseaseId", "Unknown disease")
        description = disease_entry.get("description", "No description available.")

        # Only MIM cross-references get an OMIM link.
        cross_ref = disease_entry.get("diseaseCrossReference") or {}
        mim_id = cross_ref.get("id") if cross_ref.get("database") == "MIM" else None

        disease_md = f"**{disease_name}**"
        if mim_id:
            disease_md += f" (MIM: [{mim_id}](https://www.omim.org/entry/{mim_id}))"
        disease_md += f"\n - *Description:* {description}\n"

        note_texts = (comment.get("note") or {}).get("texts") or []
        for note_text_obj in note_texts:
            note_val = note_text_obj.get("value")
            if note_val:
                disease_md += f" - *Note:* {note_val}\n"
        disease_info_list.append(disease_md)

    if not disease_info_list:
        return "No specific disease association information found in UniProt comments."
    # Each section starts with the disease name, so this sorts by name.
    return "\n---\n".join(sorted(disease_info_list))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
def extract_publications(uniprot_data):
    """Format the entry's literature references as numbered Markdown.

    The journal is read from the citation's ``"journal"`` key, falling back to
    ``"journalName"``. Authors fall back to ``"authoringGroup"`` when no
    individual authors are listed.

    Args:
        uniprot_data: Parsed UniProt entry (a dict). ``"references"`` may be
            missing or ``None``.

    Returns:
        One Markdown section per reference, in entry order and separated by
        horizontal rules, or a fixed message when there are none.
    """
    publications_list = []
    for ref_idx, ref in enumerate(uniprot_data.get("references") or []):
        citation = ref.get("citation") or {}
        title = citation.get("title", "N/A")
        author_names = citation.get("authors") or citation.get("authoringGroup") or ["N/A"]
        authors = ", ".join(str(name) for name in author_names)
        journal = citation.get("journal") or citation.get("journalName") or "N/A"
        volume = citation.get("volume", "")
        first_page = citation.get("firstPage", "")
        last_page = citation.get("lastPage", "")
        publication_date = citation.get("publicationDate", "")

        pubmed_id = None
        doi_id = None
        for xref_cite in citation.get("citationCrossReferences") or []:
            database = xref_cite.get("database")
            if database == "PubMed":
                pubmed_id = xref_cite.get("id")
            elif database == "DOI":
                doi_id = xref_cite.get("id")

        parts = [
            f"**{ref_idx + 1}. Title:** {title}\n",
            f" - *Authors:* {authors}\n",
            f" - *Journal:* {journal}",
        ]
        if volume:
            parts.append(f", Vol. {volume}")
        if first_page:
            parts.append(f", pp. {first_page}")
        if last_page:
            parts.append(f"-{last_page}")
        if publication_date:
            parts.append(f" ({publication_date})")
        parts.append("\n")
        if pubmed_id:
            parts.append(f" - *PubMed:* [{pubmed_id}](https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}/)\n")
        if doi_id:
            parts.append(f" - *DOI:* [{doi_id}](https://doi.org/{doi_id})\n")
        publications_list.append("".join(parts))

    if not publications_list:
        return "No publication information found in this UniProt entry."
    return "\n---\n".join(publications_list)
192
 
193
def extract_cross_references(uniprot_data):
    """Format selected database cross-references as grouped Markdown links.

    Ensembl links use the species-independent ``/id/`` resolver, so they are
    correct for entries of any organism. Ensembl and RefSeq references without
    a usable property fall back to the reference's own id instead of being
    dropped.

    Args:
        uniprot_data: Parsed UniProt entry (a dict).
            ``"uniProtKBCrossReferences"`` and each reference's
            ``"properties"`` may be missing or ``None``.

    Returns:
        One ``**Database:** link, link`` line per database that has links, in
        the order of the table below, joined by newlines; or a fixed message
        when nothing matches.
    """
    target_databases = {
        "Ensembl": "https://www.ensembl.org/id/",
        "GeneID": "https://www.ncbi.nlm.nih.gov/gene/",
        "RefSeq": "https://www.ncbi.nlm.nih.gov/nuccore/",
        "GO": "https://amigo.geneontology.org/amigo/term/",
        "InterPro": "https://www.ebi.ac.uk/interpro/entry/InterPro/",
        "Pfam": "https://www.ebi.ac.uk/interpro/entry/pfam/",
        "PDB": "https://www.rcsb.org/structure/",
        "KEGG": "https://www.genome.jp/dbget-bin/www_bget?",
        "Reactome": "https://reactome.org/content/detail/",
    }

    def _first_property(xref, keys):
        """Return the first non-empty property value whose key is in ``keys``."""
        for prop in xref.get("properties") or []:
            if prop.get("key") in keys and prop.get("value"):
                return prop["value"]
        return None

    grouped_xrefs = {db: set() for db in target_databases}
    for xref in uniprot_data.get("uniProtKBCrossReferences") or []:
        db_name = xref.get("database")
        if db_name not in target_databases:
            continue
        xref_id = xref.get("id")
        base_url = target_databases[db_name]

        if db_name == "Ensembl":
            # Prefer the gene, then the protein, then the reference's own id.
            linked_id = (_first_property(xref, ("GeneId",))
                         or _first_property(xref, ("ProteinId",))
                         or xref_id)
            display_text = linked_id
        elif db_name == "RefSeq":
            linked_id = _first_property(xref, ("ProteinId", "NucleotideSequenceId")) or xref_id
            display_text = linked_id
        elif db_name == "GO":
            linked_id = xref_id
            term_name = _first_property(xref, ("GoTerm",)) or xref_id
            display_text = f"{term_name} ({xref_id})"
        else:
            linked_id = xref_id
            display_text = xref_id

        if linked_id:
            grouped_xrefs[db_name].add(f"[{display_text}]({base_url}{linked_id})")

    xref_list = [
        f"**{db_name}:** " + ", ".join(sorted(links))
        for db_name, links in grouped_xrefs.items()
        if links
    ]
    if not xref_list:
        return "No cross-references found for the selected databases."
    return "\n".join(xref_list)
 
 
 
 
 
 
 
 
 
 
244
 
 
245
  def get_protein_info(uniprot_id):
246
+ empty_plot = gr.update(value=None, visible=False); empty_str = ""; err_msg_default = "Error."
247
+ outputs_on_error = (err_msg_default, empty_plot, empty_plot, empty_str,
248
+ empty_str, empty_str, empty_str, empty_str, empty_str)
249
+ if not uniprot_id:
250
+ return ("Please enter a UniProt ID.",) + outputs_on_error[1:]
251
+
252
  url = UNIPROT_API_URL.format(accession=uniprot_id.strip().upper())
253
  try:
254
  response = requests.get(url); response.raise_for_status(); data = response.json()
255
+
256
  acc = data.get("primaryAccession", "N/A"); id_display = data.get("uniProtkbId", "N/A")
257
+ uniprot_link = f"https://www.uniprot.org/uniprotkb/{acc}/entry"
258
+ name_dict = data.get("proteinDescription", {}).get("recommendedName", {}); name = name_dict.get("fullName", {}).get("value", "N/A")
259
  if name == "N/A" and data.get("proteinDescription", {}).get("submissionNames"): name = data["proteinDescription"]["submissionNames"][0].get("fullName", {}).get("value", "N/A")
260
  genes_data = data.get("genes"); gene_str = "N/A"
261
  if genes_data:
262
  g_list = [g.get("geneName", {}).get("value", "") for g in genes_data if g.get("geneName")]
263
  if not g_list: g_list = [g.get("orfNames", [{}])[0].get("value", "") for g in genes_data if g.get("orfNames")]
264
  gene_str = ", ".join(filter(None, g_list)) or "N/A"
265
+ org_data = data.get("organism", {}); org_sci_name = org_data.get("scientificName", "N/A")
266
+ org_common_name = org_data.get("commonName", "")
267
+ org_display = f"{org_sci_name}" + (f" ({org_common_name})" if org_common_name else "")
268
+ seq_info = data.get("sequence", {}); seq_val = seq_info.get("value", "N/A"); length = seq_info.get("length", 0)
269
+ status_val = data.get("entryAudit", {}).get("entryType", "N/A").replace("UniProtKB ", "")
270
+ existence_val = data.get("proteinExistence", "N/A").replace(": Evidence at ", ": ")
271
+ score_val = data.get("annotationScore", "N/A")
272
+ mw_str = "N/A"
273
  if seq_val != "N/A" and length > 0:
274
  try:
275
  clean_seq = "".join(filter(lambda x: x in AMINO_ACID_NAMES, seq_val.upper()))
 
277
  else: mw_str = "Invalid sequence for MW"
278
  except: mw_str = "Error in MW calc"
279
  comments_data = data.get("comments", []); func_comment = "N/A"
280
+ for c_item in comments_data:
281
+ if c_item.get("commentType") == "FUNCTION":
282
+ texts = c_item.get("texts", [])
283
+ if texts: func_comment = texts[0].get("value", "N/A"); break
284
+
285
+ overview_md = (f"## {id_display} ({acc})\n"
286
+ f"[{acc} on UniProt]({uniprot_link})\n\n"
287
+ f"**Protein:** {name}\n**Gene:** {gene_str}\n**Status:** {status_val}\n"
288
+ f"**Organism:** {org_display}\n**Length:** {length} aa\n"
289
+ f"**Existence:** {existence_val}\n**Score:** {score_val}/5\n**Calc. MW:** {mw_str}\n\n"
290
+ f"**Function Snippet:**\n{func_comment}\n\n"
291
+ f"**Sequence (first 100 aa):**\n`{seq_val[:100]}{'...' if len(seq_val) > 100 else ''}`\n\n"
292
+ f"--- \n*More details in other tabs.*")
293
+
294
+ interactions_md = extract_interactions(data)
295
+ pathways_md = extract_pathways(data)
296
+ disease_md = extract_disease_info(data)
297
+ publications_md = extract_publications(data)
298
+ xref_md = extract_cross_references(data)
299
+
300
+ aa_freq, aa_err = get_amino_acid_frequencies(seq_val)
301
+ aa_plot_upd = empty_plot
302
  if aa_err: overview_md += f"\n\n**AA Freq Error:** {aa_err}"
303
+ elif aa_freq:
304
+ img_aa = plot_amino_acid_frequencies(aa_freq)
305
  if img_aa: aa_plot_upd = gr.update(value=img_aa, visible=True)
306
+
307
+ seq_feat = extract_sequence_features(data)
308
+ feat_plot_upd = empty_plot; feat_msg = ""
309
  if seq_feat and length > 0:
310
  img_feat = plot_sequence_features(length, seq_feat)
311
  if img_feat: feat_plot_upd = gr.update(value=img_feat, visible=True)
312
+ else: feat_msg = "Could not generate sequence feature plot."
313
  elif not seq_feat and length > 0 : feat_msg = "No relevant features found for plotting."
314
+
315
+ return (overview_md, aa_plot_upd, feat_plot_upd, feat_msg,
316
+ pathways_md, interactions_md, disease_md, publications_md, xref_md)
317
+
318
  except requests.exceptions.HTTPError as e:
319
  err_msg_http = f"Error: ID '{uniprot_id}' not found." if e.response.status_code == 404 else f"HTTP error: {e}"
320
  return (err_msg_http,) + outputs_on_error[1:]
321
+ except Exception as e:
322
+ return (f"Error: {str(e)[:150]}",) + outputs_on_error[1:]
323
 
 
324
# ---------------------------------------------------------------------------
# Gradio user interface
# Layout: a title, an input row (ID textbox + submit button) and five tabs
# whose components receive the nine values returned by get_protein_info.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Glass()) as iface:
    gr.Markdown("# Protein Profile Viewer (v1.4 - Cross-references)")
    gr.Markdown("Enter a UniProt ID to explore its details including overview, sequence analysis, functional context, publications, and links to other databases.")

    # Query row: accession textbox next to the trigger button.
    with gr.Row():
        protein_id_input = gr.Textbox(
            label="Enter UniProt ID",
            placeholder="e.g., P00533",
            scale=3,
        )
        submit_button = gr.Button("Submit", scale=1, variant="primary")

    with gr.Tabs():
        # Tab 1: textual summary of the entry.
        with gr.TabItem("Overview"):
            gr.Markdown("### Protein Overview\nKey information about the protein, including its UniProt ID, name, gene, organism, length, function snippet, and a link to the full UniProt entry. The first 100 amino acids of the sequence are also displayed here.")
            overview_output = gr.Markdown()

        # Tab 2: plots; the images start hidden and are revealed by the callback.
        with gr.TabItem("Analysis Plots"):
            gr.Markdown("### Sequence Analysis Visualizations\nGraphical representations of amino acid composition and annotated sequence features.")
            with gr.Column():
                aa_freq_plot_output = gr.Image(
                    label="Amino Acid Frequency Plot",
                    type="pil",
                    show_label=True,
                    visible=False,
                )
                seq_features_plot_output = gr.Image(
                    label="Sequence Features Plot",
                    type="pil",
                    show_label=True,
                    visible=False,
                )
                seq_features_message_output = gr.Markdown()

        # Tab 3: pathways, interactions and diseases, each in a collapsed accordion.
        with gr.TabItem("Functional Context"):
            gr.Markdown("### Pathways, Interactions & Disease\nBiological context: pathways, interaction partners, and associated diseases.")
            with gr.Accordion("Biological Pathways (KEGG, Reactome)", open=False):
                pathways_output = gr.Markdown()
            with gr.Accordion("Protein Interactions", open=False):
                interactions_output = gr.Markdown()
            with gr.Accordion("Disease Associations", open=False):
                disease_output = gr.Markdown()

        # Tab 4: literature attached to the entry.
        with gr.TabItem("Publications"):
            gr.Markdown("### Relevant Publications\nA list of scientific publications from UniProt.")
            publications_output = gr.Markdown()

        # Tab 5: links out to other databases.
        with gr.TabItem("Cross-references"):
            gr.Markdown("### Database Links\nLinks to this protein's entry in other relevant biological databases (e.g., Ensembl, RefSeq, GO, PDB).")
            xref_output = gr.Markdown()

    # Order must match the tuple returned by get_protein_info.
    result_components = [
        overview_output,
        aa_freq_plot_output,
        seq_features_plot_output,
        seq_features_message_output,
        pathways_output,
        interactions_output,
        disease_output,
        publications_output,
        xref_output,
    ]
    submit_button.click(
        fn=get_protein_info,
        inputs=protein_id_input,
        outputs=result_components,
    )

    # Clickable sample accessions that fill the textbox.
    example_ids = [["P05067"], ["P00533"], ["Q9BYF1"], ["P0DP23"], ["P04637"]]
    gr.Examples(examples=example_ids, inputs=protein_id_input)

if __name__ == "__main__":
    iface.launch()