Spaces:
Runtime error
Runtime error
Kaveh commited on
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,10 +8,8 @@ from PIL import Image
|
|
| 8 |
import sys
|
| 9 |
import traceback
|
| 10 |
import json
|
| 11 |
-
import pandas as pd # Keep import, might be useful later
|
| 12 |
|
| 13 |
UNIPROT_API_URL = "https://rest.uniprot.org/uniprotkb/{accession}.json"
|
| 14 |
-
UNIPROT_SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search"
|
| 15 |
|
| 16 |
AMINO_ACID_NAMES = {
|
| 17 |
'A': 'Alanine', 'R': 'Arginine', 'N': 'Asparagine', 'D': 'Aspartic acid',
|
|
@@ -22,191 +20,256 @@ AMINO_ACID_NAMES = {
|
|
| 22 |
}
|
| 23 |
STANDARD_AMINO_ACIDS_ORDER = "ARNDCQEGHILKMFPSTWYV"
|
| 24 |
|
| 25 |
-
|
| 26 |
-
def get_amino_acid_frequencies(sequence): # ... (same as before) ...
|
| 27 |
if not sequence or sequence == "N/A": return None, "Sequence not available for analysis."
|
| 28 |
cleaned_sequence = "".join(filter(lambda x: x in AMINO_ACID_NAMES, sequence.upper()))
|
| 29 |
-
if not cleaned_sequence: return None, "No valid amino acids found for counting."
|
| 30 |
-
counts = Counter(cleaned_sequence)
|
|
|
|
| 31 |
return frequencies, None
|
| 32 |
-
|
|
|
|
| 33 |
if not frequencies: return None
|
| 34 |
ordered_keys = [key for key in STANDARD_AMINO_ACIDS_ORDER if key in frequencies]
|
| 35 |
-
labels = [f"{aa}: {AMINO_ACID_NAMES.get(aa, aa)}" for aa in ordered_keys]
|
|
|
|
| 36 |
fig, ax = plt.subplots(figsize=(12, 7)); ax.bar(labels, values, color='skyblue')
|
| 37 |
-
ax.set_xlabel("Amino Acid"); ax.set_ylabel("Frequency"); ax.set_title("
|
| 38 |
plt.xticks(rotation=75, ha="right", fontsize=8); plt.tight_layout()
|
| 39 |
-
buf = io.BytesIO(); plt.savefig(buf, format='png'); buf.seek(0)
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
try:
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
if not features or sequence_length == 0: return None
|
| 66 |
fig, ax = plt.subplots(figsize=(12, max(3, len(features) * 0.4) + 1.5))
|
| 67 |
-
ax.set_xlim(0, sequence_length); ax.set_xlabel("
|
| 68 |
-
ax.set_title("Sequence Features");
|
| 69 |
-
|
| 70 |
-
for
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
| 76 |
else: plt.close(fig); return None
|
| 77 |
-
if
|
| 78 |
plt.tight_layout(rect=[0, 0, 0.83, 0.96])
|
| 79 |
-
buf = io.BytesIO(); plt.savefig(buf, format='png'); buf.seek(0)
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
| 82 |
if "comments" in uniprot_data:
|
| 83 |
-
for
|
| 84 |
-
if
|
| 85 |
-
for
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
if
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
if "uniProtKBCrossReferences" in uniprot_data:
|
| 96 |
for xref in uniprot_data["uniProtKBCrossReferences"]:
|
| 97 |
-
|
| 98 |
-
if
|
| 99 |
-
|
| 100 |
if "properties" in xref:
|
| 101 |
-
for
|
| 102 |
-
if
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
if "comments" in uniprot_data:
|
| 108 |
-
for
|
| 109 |
-
if
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
for xr in cit["citationCrossReferences"]:
|
| 127 |
-
if xr.get("database") == "PubMed": pmid = xr.get("id")
|
| 128 |
-
elif xr.get("database") == "DOI": doi = xr.get("id")
|
| 129 |
-
md = f"**{i + 1}. {title}**\n - *{authors}*\n - *{j}" + (f", {v}" if v else "") + (f":{f}" if f else "") + (f"-{l}" if l else "")
|
| 130 |
-
if d: md += f" ({d})"
|
| 131 |
-
md += "*\n"
|
| 132 |
-
if pmid: md += f" - [PubMed {pmid}](https://pubmed.ncbi.nlm.nih.gov/{pmid}/)\n"
|
| 133 |
-
if doi: md += f" - [DOI {doi}](https://doi.org/{doi})\n"
|
| 134 |
-
pubs.append(md)
|
| 135 |
-
return "\n---\n".join(pubs) if pubs else "No publication info found."
|
| 136 |
-
def extract_cross_references(uniprot_data): # ... (same as before) ...
|
| 137 |
-
xrefs = []; dbs = {"Ensembl": "...", "GeneID": "...", "RefSeq": "...", "GO": "...", "InterPro": "...", "Pfam": "...", "PDB": "...", "KEGG": "...", "Reactome": "..."} # URLs omitted
|
| 138 |
-
grouped = {db: [] for db in dbs};
|
| 139 |
-
if "uniProtKBCrossReferences" in uniprot_data:
|
| 140 |
-
for xr in uniprot_data["uniProtKBCrossReferences"]:
|
| 141 |
-
db = xr.get("database"); xid = xr.get("id")
|
| 142 |
-
if db in dbs and xid:
|
| 143 |
-
url, txt = None, xid;
|
| 144 |
-
if db == "GO": txt = xr.get("properties", [{}])[0].get("value", xid) + f" ({xid})"
|
| 145 |
-
url = dbs[db] + xid
|
| 146 |
-
if url: link_md = f"[{txt}]({url})";
|
| 147 |
-
if link_md not in grouped[db]: grouped[db].append(link_md)
|
| 148 |
-
for db, links in grouped.items():
|
| 149 |
-
if links: xrefs.append(f"**{db}:** " + ", ".join(sorted(list(set(links)))))
|
| 150 |
-
return "\n".join(xrefs) if xrefs else "No selected cross-references."
|
| 151 |
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
return "" # No selection or no data
|
| 179 |
-
|
| 180 |
-
row_index = evt.index[0] # Get the row index of the selection
|
| 181 |
-
|
| 182 |
-
# Ensure the row index is valid for the current dataset results
|
| 183 |
-
if 0 <= row_index < len(results_data):
|
| 184 |
-
selected_row = results_data[row_index]
|
| 185 |
-
accession_id = selected_row[0] # Accession ID is the first element
|
| 186 |
-
return accession_id
|
| 187 |
-
return "" # Return empty if index is out of bounds
|
| 188 |
|
| 189 |
-
# --- Main Function to Get All Info ---
|
| 190 |
def get_protein_info(uniprot_id):
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
if not uniprot_id:
|
|
|
|
|
|
|
| 195 |
url = UNIPROT_API_URL.format(accession=uniprot_id.strip().upper())
|
| 196 |
try:
|
| 197 |
response = requests.get(url); response.raise_for_status(); data = response.json()
|
|
|
|
| 198 |
acc = data.get("primaryAccession", "N/A"); id_display = data.get("uniProtkbId", "N/A")
|
| 199 |
-
|
|
|
|
| 200 |
if name == "N/A" and data.get("proteinDescription", {}).get("submissionNames"): name = data["proteinDescription"]["submissionNames"][0].get("fullName", {}).get("value", "N/A")
|
| 201 |
genes_data = data.get("genes"); gene_str = "N/A"
|
| 202 |
if genes_data:
|
| 203 |
g_list = [g.get("geneName", {}).get("value", "") for g in genes_data if g.get("geneName")]
|
| 204 |
if not g_list: g_list = [g.get("orfNames", [{}])[0].get("value", "") for g in genes_data if g.get("orfNames")]
|
| 205 |
gene_str = ", ".join(filter(None, g_list)) or "N/A"
|
| 206 |
-
org_data = data.get("organism", {}); org_sci_name = org_data.get("scientificName", "N/A")
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
if seq_val != "N/A" and length > 0:
|
| 211 |
try:
|
| 212 |
clean_seq = "".join(filter(lambda x: x in AMINO_ACID_NAMES, seq_val.upper()))
|
|
@@ -214,114 +277,91 @@ def get_protein_info(uniprot_id):
|
|
| 214 |
else: mw_str = "Invalid sequence for MW"
|
| 215 |
except: mw_str = "Error in MW calc"
|
| 216 |
comments_data = data.get("comments", []); func_comment = "N/A"
|
| 217 |
-
for c_item in comments_data:
|
| 218 |
-
if c_item.get("commentType") == "FUNCTION":
|
| 219 |
-
texts = c_item.get("texts", [])
|
| 220 |
-
if texts:
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
f"**
|
| 225 |
-
f"**
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
if aa_err: overview_md += f"\n\n**AA Freq Error:** {aa_err}"
|
| 230 |
-
elif aa_freq:
|
|
|
|
| 231 |
if img_aa: aa_plot_upd = gr.update(value=img_aa, visible=True)
|
| 232 |
-
|
|
|
|
|
|
|
| 233 |
if seq_feat and length > 0:
|
| 234 |
img_feat = plot_sequence_features(length, seq_feat)
|
| 235 |
if img_feat: feat_plot_upd = gr.update(value=img_feat, visible=True)
|
| 236 |
-
else: feat_msg = "Could not generate feature plot."
|
| 237 |
elif not seq_feat and length > 0 : feat_msg = "No relevant features found for plotting."
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
| 239 |
except requests.exceptions.HTTPError as e:
|
| 240 |
err_msg_http = f"Error: ID '{uniprot_id}' not found." if e.response.status_code == 404 else f"HTTP error: {e}"
|
| 241 |
return (err_msg_http,) + outputs_on_error[1:]
|
| 242 |
-
except Exception as e:
|
|
|
|
| 243 |
|
| 244 |
-
# --- Gradio UI Definition ---
|
| 245 |
with gr.Blocks(theme=gr.themes.Glass()) as iface:
|
| 246 |
-
gr.Markdown("# Protein Profile Viewer (v1.
|
| 247 |
-
gr.Markdown("Enter a UniProt ID
|
| 248 |
-
|
| 249 |
-
with gr.Group():
|
| 250 |
-
gr.Markdown("### Find UniProt ID by Name/Keyword")
|
| 251 |
-
search_term_input = gr.Textbox(label="Search Term (min 3 chars, type and wait)", placeholder="e.g., insulin, EGFR, P53")
|
| 252 |
-
search_status_output = gr.Markdown()
|
| 253 |
-
# Dataset to store search results data (invisible, used as state)
|
| 254 |
-
search_results_data_state = gr.State([])
|
| 255 |
-
# Dataset component for display
|
| 256 |
-
search_results_output_display = gr.Dataset(
|
| 257 |
-
label="Search Results (Select a row to copy Accession)",
|
| 258 |
-
headers=["Accession", "UniProtKB ID", "Protein Name", "Organism"],
|
| 259 |
-
samples=[], samples_per_page=5
|
| 260 |
-
)
|
| 261 |
-
selected_id_to_copy = gr.Textbox(label="Copy this Accession ID:", interactive=True, show_copy_button=True)
|
| 262 |
-
|
| 263 |
-
gr.Markdown("---")
|
| 264 |
-
|
| 265 |
-
gr.Markdown("### View Protein Profile")
|
| 266 |
with gr.Row():
|
| 267 |
-
protein_id_input = gr.Textbox(label="Enter UniProt
|
| 268 |
-
submit_button = gr.Button("
|
| 269 |
-
|
| 270 |
with gr.Tabs():
|
| 271 |
-
# ... (Tabs definition remains the same) ...
|
| 272 |
with gr.TabItem("Overview"):
|
| 273 |
-
|
| 274 |
-
|
| 275 |
with gr.TabItem("Analysis Plots"):
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
with gr.TabItem("Functional Context"):
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
| 286 |
with gr.TabItem("Publications"):
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
with gr.TabItem("Cross-references"):
|
| 290 |
-
gr.Markdown("### Database Links\nLinks to other relevant databases.")
|
| 291 |
-
xref_output = gr.Markdown()
|
| 292 |
|
| 293 |
-
# --- Event Handlers (Corrected) ---
|
| 294 |
-
search_term_input.change(
|
| 295 |
-
fn=search_uniprot_by_name,
|
| 296 |
-
inputs=search_term_input,
|
| 297 |
-
# Output both the status message and the data for the Dataset state
|
| 298 |
-
outputs=[search_status_output, search_results_data_state]
|
| 299 |
-
)
|
| 300 |
-
|
| 301 |
-
# When the state data changes, update the displayed Dataset
|
| 302 |
-
search_results_data_state.change(
|
| 303 |
-
fn=lambda data: data, # Simple function to pass data through
|
| 304 |
-
inputs=search_results_data_state,
|
| 305 |
-
outputs=search_results_output_display
|
| 306 |
-
)
|
| 307 |
-
|
| 308 |
-
# When a row is selected in the displayed Dataset, update the copy box
|
| 309 |
-
search_results_output_display.select(
|
| 310 |
-
fn=update_copy_box,
|
| 311 |
-
# Pass the *state* containing the data and the event data
|
| 312 |
-
inputs=[search_results_data_state],
|
| 313 |
-
outputs=selected_id_to_copy
|
| 314 |
-
# Removed the incorrect _js argument
|
| 315 |
-
)
|
| 316 |
-
|
| 317 |
-
# Main profile submit button action
|
| 318 |
submit_button.click(
|
| 319 |
-
fn=get_protein_info,
|
|
|
|
| 320 |
outputs=[overview_output, aa_freq_plot_output, seq_features_plot_output, seq_features_message_output,
|
| 321 |
pathways_output, interactions_output, disease_output, publications_output, xref_output]
|
| 322 |
)
|
| 323 |
-
|
| 324 |
-
|
|
|
|
|
|
|
| 325 |
|
| 326 |
if __name__ == "__main__":
|
| 327 |
iface.launch()
|
|
|
|
| 8 |
import sys
|
| 9 |
import traceback
|
| 10 |
import json
|
|
|
|
| 11 |
|
| 12 |
UNIPROT_API_URL = "https://rest.uniprot.org/uniprotkb/{accession}.json"
|
|
|
|
| 13 |
|
| 14 |
AMINO_ACID_NAMES = {
|
| 15 |
'A': 'Alanine', 'R': 'Arginine', 'N': 'Asparagine', 'D': 'Aspartic acid',
|
|
|
|
| 20 |
}
|
| 21 |
STANDARD_AMINO_ACIDS_ORDER = "ARNDCQEGHILKMFPSTWYV"
|
| 22 |
|
| 23 |
+
def get_amino_acid_frequencies(sequence):
    """Count how often each standard amino acid occurs in a protein sequence.

    The sequence is upper-cased and every character that is not a key of
    ``AMINO_ACID_NAMES`` is discarded before counting.

    Returns:
        A ``(frequencies, error)`` pair. On success ``frequencies`` maps every
        letter of ``STANDARD_AMINO_ACIDS_ORDER`` to its count (zero counts
        included) and ``error`` is ``None``. On failure ``frequencies`` is
        ``None`` and ``error`` is a human-readable message.
    """
    if not sequence or sequence == "N/A":
        return None, "Sequence not available for analysis."

    residues = [ch for ch in sequence.upper() if ch in AMINO_ACID_NAMES]
    if not residues:
        return None, "No valid amino acids found in sequence for counting."

    tally = Counter(residues)
    frequencies = {}
    for aa in STANDARD_AMINO_ACIDS_ORDER:
        # Counter returns 0 for letters that never occurred.
        frequencies[aa] = tally[aa]
    return frequencies, None
|
| 30 |
+
|
| 31 |
+
def plot_amino_acid_frequencies(frequencies):
    """Render a bar chart of amino acid counts and return it as a PIL image.

    Args:
        frequencies: Mapping of one-letter amino acid code to its count, as
            produced by ``get_amino_acid_frequencies``.

    Returns:
        A ``PIL.Image`` holding the PNG-rendered chart, or ``None`` when
        ``frequencies`` is empty or ``None``.
    """
    if not frequencies:
        return None

    ordered_keys = [key for key in STANDARD_AMINO_ACIDS_ORDER if key in frequencies]
    labels = [f"{aa}: {AMINO_ACID_NAMES.get(aa, aa)}" for aa in ordered_keys]
    values = [frequencies[aa] for aa in ordered_keys]

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        ax.bar(labels, values, color='skyblue')
        ax.set_xlabel("Amino Acid")
        ax.set_ylabel("Frequency")
        ax.set_title("Amino Acid Frequency Plot")
        # Work on this figure/axes explicitly instead of pyplot's implicit
        # "current figure", which another request could have replaced.
        plt.setp(ax.get_xticklabels(), rotation=75, ha="right", fontsize=8)
        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return Image.open(buf)
    finally:
        # Always release the figure, even if drawing or encoding failed.
        plt.close(fig)
|
| 42 |
+
|
| 43 |
+
def _feature_position(node):
    """Return the integer stored under ``node["value"]``, or ``None`` if absent.

    Raises ``ValueError`` when the stored value is not an integer literal.
    """
    if isinstance(node, dict) and "value" in node:
        return int(str(node["value"]))
    return None


def extract_sequence_features(uniprot_data):
    """Collect the plottable sequence features of a UniProt entry.

    Feature types are matched case-insensitively, with spaces and hyphens
    treated as underscores, so that both human-readable names
    ("Active site", "Beta strand", "Modified residue") and legacy short codes
    ("ACT_SITE", "STRAND", "MOD_RES") are recognised.

    Args:
        uniprot_data: Parsed UniProt entry (a dict). Its optional
            ``"features"`` list holds dicts with ``"type"``, ``"location"``
            and optionally ``"description"``.

    Returns:
        A list of dicts with keys ``type`` (the raw type string), ``begin``,
        ``end`` (integer positions), ``description`` and ``color``. Features
        with a missing, non-integer or reversed location are skipped.
    """
    feature_colors = {
        "DOMAIN": "blue",
        "MOTIF": "green",
        "ACTIVE_SITE": "red", "ACT_SITE": "red",
        "BINDING_SITE": "orange", "BINDING": "orange",
        "MOD_RES": "purple", "MODIFIED_RESIDUE": "purple",
        "HELIX": "cyan",
        "STRAND": "magenta", "BETA_STRAND": "magenta",
        "TURN": "gold",
    }

    extracted_features = []
    for feature_item in uniprot_data.get("features") or []:
        if not isinstance(feature_item, dict):
            continue
        feature_type_raw = feature_item.get("type")
        if not isinstance(feature_type_raw, str):
            continue
        normalized = feature_type_raw.strip().upper().replace(" ", "_").replace("-", "_")
        color = feature_colors.get(normalized)
        if color is None:
            continue

        try:
            location = feature_item.get("location", {})
            begin_pos = _feature_position(location.get("start"))
            end_pos = _feature_position(location.get("end"))
            if begin_pos is None or end_pos is None:
                # Single-residue features may carry only a "position" node;
                # use it for whichever end is missing.
                single = _feature_position(location.get("position"))
                if begin_pos is None:
                    begin_pos = single
                if end_pos is None:
                    end_pos = single
        except (ValueError, TypeError, AttributeError):
            # Malformed location: skip this feature, keep the rest.
            continue

        if begin_pos is None or end_pos is None or begin_pos > end_pos:
            continue

        extracted_features.append({
            "type": feature_type_raw,
            "begin": begin_pos,
            "end": end_pos,
            "description": feature_item.get("description", feature_type_raw),
            "color": color,
        })
    return extracted_features
|
| 79 |
+
|
| 80 |
+
def plot_sequence_features(sequence_length, features):
    """Draw each sequence feature as a horizontal bar and return a PIL image.

    Args:
        sequence_length: Length of the protein; used as the x-axis upper limit.
        features: List of feature dicts with ``begin``, ``end``, ``color`` and
            ``type`` keys, as produced by ``extract_sequence_features``.

    Returns:
        A ``PIL.Image`` holding the PNG-rendered plot, or ``None`` when there
        are no features or the sequence length is zero.
    """
    if not features or sequence_length == 0:
        return None

    bar_height = 0.8
    fig, ax = plt.subplots(figsize=(12, max(3, len(features) * 0.4) + 1.5))
    try:
        ax.set_xlim(0, sequence_length)
        ax.set_xlabel("Amino Acid Position")
        ax.set_yticks([])
        ax.set_title("Sequence Features Plot")

        legend_handles = {}  # one legend entry per feature type, first colour wins
        ordered = sorted(features, key=lambda x: x["begin"])
        for row, feature in enumerate(ordered):
            begin = feature["begin"]
            end = feature["end"]
            color = feature["color"]
            width = max(1, end - begin + 1)
            # Positions are 1-based; shift left by one so residue 1 starts at x=0.
            ax.barh(row, width, height=bar_height, left=begin - 1,
                    color=color, edgecolor='black', alpha=0.7)
            if feature['type'] not in legend_handles:
                legend_handles[feature['type']] = plt.Rectangle((0, 0), 1, 1, fc=color, alpha=0.7)

        ax.set_ylim(-0.5, len(ordered) - 1 + bar_height / 2 + 0.5)
        ax.legend(list(legend_handles.values()), list(legend_handles.keys()),
                  title="Feature Types", bbox_to_anchor=(1.02, 1),
                  loc='upper left', fontsize=8)
        # Leave room on the right for the legend placed outside the axes.
        fig.tight_layout(rect=[0, 0, 0.83, 0.96])

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        return Image.open(buf)
    finally:
        # Always release the figure, even if drawing or encoding failed.
        plt.close(fig)
|
| 101 |
+
|
| 102 |
+
def extract_interactions(uniprot_data):
    """Summarise the interaction partners listed in an entry's comments.

    Only interactions whose first interactant accession equals the entry's
    ``primaryAccession`` are reported. Each partner is listed once.

    Args:
        uniprot_data: Parsed UniProt entry (a dict).

    Returns:
        A Markdown bullet list, sorted alphabetically, or a fixed message when
        no partners are found.
    """
    primary_acc = uniprot_data.get("primaryAccession")
    partner_lines = set()  # a set so repeated partners are listed only once

    # "or []" tolerates keys that are present but null.
    for comment in uniprot_data.get("comments") or []:
        if not isinstance(comment, dict) or comment.get("commentType") != "INTERACTION":
            continue
        for entry in comment.get("interactions") or []:
            if not isinstance(entry, dict):
                continue
            first = entry.get("interactantOne") or {}
            second = entry.get("interactantTwo") or {}
            partner_acc = second.get("uniProtKBAccession")
            if first.get("uniProtKBAccession") != primary_acc or not partner_acc:
                continue
            gene = second.get("geneName")
            label = f"{gene} ({partner_acc})" if gene else partner_acc
            partner_lines.add(f"- Interacts with: **{label}**")

    if not partner_lines:
        return "No specific interaction partners listed in UniProt comments."
    # Sort alphabetically for consistency
    return "\n".join(sorted(partner_lines))
|
| 120 |
+
|
| 121 |
+
def extract_pathways(uniprot_data):
    """List KEGG and Reactome pathway cross-references as Markdown links.

    Args:
        uniprot_data: Parsed UniProt entry (a dict).

    Returns:
        A sorted, de-duplicated Markdown bullet list, or a fixed message when
        no KEGG/Reactome cross-reference with an id is present.
    """
    pathway_databases = {
        "KEGG": "https://www.genome.jp/dbget-bin/www_bget?",
        "Reactome": "https://reactome.org/content/detail/",
    }
    pathways = set()

    # "or []" tolerates keys that are present but null.
    for xref in uniprot_data.get("uniProtKBCrossReferences") or []:
        if not isinstance(xref, dict):
            continue
        db_name = xref.get("database")
        pathway_id = xref.get("id")
        if db_name not in pathway_databases or not pathway_id:
            continue

        description = ""
        for prop in xref.get("properties") or []:
            if isinstance(prop, dict) and prop.get("key") in ("PathwayName", "Description"):
                description = prop.get("value") or ""
                break
        # A lone "-" is a placeholder for "no value", not a pathway name.
        if str(description).strip() == "-":
            description = ""

        link = f"{pathway_databases[db_name]}{pathway_id}"
        display_text = f"{description} ({pathway_id})" if description else pathway_id
        pathways.add(f"- [{display_text}]({link}) ({db_name})")

    if not pathways:
        return "No pathway information found in KEGG or Reactome cross-references."
    return "\n".join(sorted(pathways))
|
| 139 |
+
|
| 140 |
+
def extract_disease_info(uniprot_data):
    """Format the DISEASE comments of a UniProt entry as Markdown.

    Only comments of type ``DISEASE`` that carry a ``disease`` object are
    reported. Each entry shows the disease name, an OMIM link when the disease
    cross-reference points at the ``MIM`` database, the description, and any
    non-empty note texts.

    Returns:
        The entries sorted as strings and separated by horizontal rules, or a
        fixed message when there is nothing to report.
    """
    nothing_found = "No specific disease association information found in UniProt comments."
    if "comments" not in uniprot_data:
        return nothing_found

    entries = []
    for comment in uniprot_data["comments"]:
        if comment.get("commentType") != "DISEASE" or "disease" not in comment:
            continue

        disease = comment["disease"]
        name = disease.get("diseaseId", "Unknown disease")
        summary = disease.get("description", "No description available.")

        pieces = [f"**{name}**"]
        if "diseaseCrossReference" in disease:
            cross_ref = disease["diseaseCrossReference"]
            if cross_ref.get("database") == "MIM":
                omim = cross_ref.get("id")
                if omim:
                    pieces.append(f" (MIM: [{omim}](https://www.omim.org/entry/{omim}))")
        pieces.append(f"\n - *Description:* {summary}\n")

        if "note" in comment and "texts" in comment["note"]:
            for text_obj in comment["note"]["texts"]:
                text = text_obj.get("value")
                if text:
                    pieces.append(f" - *Note:* {text}\n")

        entries.append("".join(pieces))

    if not entries:
        return nothing_found
    entries.sort()  # Sort alphabetically by disease name
    return "\n---\n".join(entries)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
+
def extract_publications(uniprot_data):
    """Format the literature references of a UniProt entry as Markdown.

    Each reference is numbered in entry order and shows title, authors,
    journal (with volume, pages and date when present) and PubMed/DOI links
    when the citation carries those cross-references.

    Args:
        uniprot_data: Parsed UniProt entry (a dict).

    Returns:
        The formatted references separated by horizontal rules, or a fixed
        message when the entry lists none.
    """
    publications_list = []

    # "or []" / "or {}" tolerate keys that are present but null.
    for ref_idx, ref in enumerate(uniprot_data.get("references") or []):
        citation = (ref.get("citation") if isinstance(ref, dict) else None) or {}
        title = citation.get("title", "N/A")
        # str() guards against non-string author entries breaking the join.
        authors = ", ".join(str(author) for author in (citation.get("authors") or ["N/A"]))
        journal = citation.get("journalName", "N/A")
        volume = citation.get("volume", "")
        first_page = citation.get("firstPage", "")
        last_page = citation.get("lastPage", "")
        publication_date = citation.get("publicationDate", "")

        pubmed_id = None
        doi_id = None
        for xref_cite in citation.get("citationCrossReferences") or []:
            if not isinstance(xref_cite, dict):
                continue
            database = xref_cite.get("database")
            if database == "PubMed":
                pubmed_id = xref_cite.get("id")
            elif database == "DOI":
                doi_id = xref_cite.get("id")

        journal_line = f" - *Journal:* {journal}"
        if volume:
            journal_line += f", Vol. {volume}"
        if first_page:
            journal_line += f", pp. {first_page}"
            if last_page:
                journal_line += f"-{last_page}"
        elif last_page:
            # Without a first page a bare "-<last>" would read as part of the
            # journal name; show it as a single page instead.
            journal_line += f", p. {last_page}"
        if publication_date:
            journal_line += f" ({publication_date})"

        parts = [
            f"**{ref_idx + 1}. Title:** {title}\n",
            f" - *Authors:* {authors}\n",
            journal_line + "\n",
        ]
        if pubmed_id:
            parts.append(f" - *PubMed:* [{pubmed_id}](https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}/)\n")
        if doi_id:
            parts.append(f" - *DOI:* [{doi_id}](https://doi.org/{doi_id})\n")
        publications_list.append("".join(parts))

    if not publications_list:
        return "No publication information found in this UniProt entry."
    return "\n---\n".join(publications_list)
|
| 192 |
|
| 193 |
+
def _first_xref_property(xref, keys):
    """Return the first usable property value of ``xref`` whose key is in ``keys``.

    Empty values and the "-" placeholder are skipped; ``None`` is returned
    when nothing usable is found.
    """
    for prop in xref.get("properties") or []:
        if isinstance(prop, dict) and prop.get("key") in keys:
            value = prop.get("value")
            if value and value != "-":
                return value
    return None


def extract_cross_references(uniprot_data):
    """Build Markdown links to selected external databases for an entry.

    Args:
        uniprot_data: Parsed UniProt entry (a dict).

    Returns:
        One line per database that has at least one link, in the order of the
        ``target_databases`` table, each line listing its sorted, de-duplicated
        links; or a fixed message when nothing matches.
    """
    target_databases = {
        "Ensembl": "https://www.ensembl.org/id/", "GeneID": "https://www.ncbi.nlm.nih.gov/gene/",
        "RefSeq": "https://www.ncbi.nlm.nih.gov/nuccore/", "GO": "https://amigo.geneontology.org/amigo/term/",
        "InterPro": "https://www.ebi.ac.uk/interpro/entry/InterPro/",
        "Pfam": "https://www.ebi.ac.uk/interpro/entry/pfam/",
        "PDB": "https://www.rcsb.org/structure/",
        "KEGG": "https://www.genome.jp/dbget-bin/www_bget?",  # KEGG gene link
        "Reactome": "https://reactome.org/content/detail/",  # Reactome protein link (usually same as pathway)
    }
    refseq_protein_url = "https://www.ncbi.nlm.nih.gov/protein/"
    grouped_xrefs = {db: set() for db in target_databases}

    # "or []" tolerates a key that is present but null.
    for xref in uniprot_data.get("uniProtKBCrossReferences") or []:
        if not isinstance(xref, dict):
            continue
        db_name = xref.get("database")
        if db_name not in target_databases:
            continue

        base_url = target_databases[db_name]
        xref_id = xref.get("id")
        link_url = None
        display_text = xref_id

        if db_name == "Ensembl":
            # Prefer the gene, then the protein, then the cross-reference id.
            # The species-neutral /id/ resolver avoids hard-coding Homo_sapiens.
            stable_id = (_first_xref_property(xref, ("GeneId",))
                         or _first_xref_property(xref, ("ProteinId",))
                         or xref_id)
            if stable_id:
                link_url = f"{base_url}{stable_id}"
                display_text = stable_id
        elif db_name == "RefSeq":
            nucleotide_id = _first_xref_property(xref, ("NucleotideSequenceId",))
            protein_id = _first_xref_property(xref, ("ProteinId",)) or xref_id
            if nucleotide_id:
                link_url = f"{base_url}{nucleotide_id}"
                display_text = nucleotide_id
            elif protein_id:
                # No nucleotide record: link the protein accession to the
                # protein database rather than dropping the cross-reference.
                link_url = f"{refseq_protein_url}{protein_id}"
                display_text = protein_id
        elif xref_id:
            link_url = f"{base_url}{xref_id}"
            if db_name == "GO":
                term_name = _first_xref_property(xref, ("GoTerm",)) or xref_id
                display_text = f"{term_name} ({xref_id})"

        if link_url:
            grouped_xrefs[db_name].add(f"[{display_text}]({link_url})")

    xref_list = [
        f"**{db_name}:** " + ", ".join(sorted(links))
        for db_name, links in grouped_xrefs.items()
        if links
    ]
    if not xref_list:
        return "No cross-references found for the selected databases."
    return "\n".join(xref_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
|
|
|
| 245 |
def get_protein_info(uniprot_id):
|
| 246 |
+
empty_plot = gr.update(value=None, visible=False); empty_str = ""; err_msg_default = "Error."
|
| 247 |
+
outputs_on_error = (err_msg_default, empty_plot, empty_plot, empty_str,
|
| 248 |
+
empty_str, empty_str, empty_str, empty_str, empty_str)
|
| 249 |
+
if not uniprot_id:
|
| 250 |
+
return ("Please enter a UniProt ID.",) + outputs_on_error[1:]
|
| 251 |
+
|
| 252 |
url = UNIPROT_API_URL.format(accession=uniprot_id.strip().upper())
|
| 253 |
try:
|
| 254 |
response = requests.get(url); response.raise_for_status(); data = response.json()
|
| 255 |
+
|
| 256 |
acc = data.get("primaryAccession", "N/A"); id_display = data.get("uniProtkbId", "N/A")
|
| 257 |
+
uniprot_link = f"https://www.uniprot.org/uniprotkb/{acc}/entry"
|
| 258 |
+
name_dict = data.get("proteinDescription", {}).get("recommendedName", {}); name = name_dict.get("fullName", {}).get("value", "N/A")
|
| 259 |
if name == "N/A" and data.get("proteinDescription", {}).get("submissionNames"): name = data["proteinDescription"]["submissionNames"][0].get("fullName", {}).get("value", "N/A")
|
| 260 |
genes_data = data.get("genes"); gene_str = "N/A"
|
| 261 |
if genes_data:
|
| 262 |
g_list = [g.get("geneName", {}).get("value", "") for g in genes_data if g.get("geneName")]
|
| 263 |
if not g_list: g_list = [g.get("orfNames", [{}])[0].get("value", "") for g in genes_data if g.get("orfNames")]
|
| 264 |
gene_str = ", ".join(filter(None, g_list)) or "N/A"
|
| 265 |
+
org_data = data.get("organism", {}); org_sci_name = org_data.get("scientificName", "N/A")
|
| 266 |
+
org_common_name = org_data.get("commonName", "")
|
| 267 |
+
org_display = f"{org_sci_name}" + (f" ({org_common_name})" if org_common_name else "")
|
| 268 |
+
seq_info = data.get("sequence", {}); seq_val = seq_info.get("value", "N/A"); length = seq_info.get("length", 0)
|
| 269 |
+
status_val = data.get("entryAudit", {}).get("entryType", "N/A").replace("UniProtKB ", "")
|
| 270 |
+
existence_val = data.get("proteinExistence", "N/A").replace(": Evidence at ", ": ")
|
| 271 |
+
score_val = data.get("annotationScore", "N/A")
|
| 272 |
+
mw_str = "N/A"
|
| 273 |
if seq_val != "N/A" and length > 0:
|
| 274 |
try:
|
| 275 |
clean_seq = "".join(filter(lambda x: x in AMINO_ACID_NAMES, seq_val.upper()))
|
|
|
|
| 277 |
else: mw_str = "Invalid sequence for MW"
|
| 278 |
except: mw_str = "Error in MW calc"
|
| 279 |
comments_data = data.get("comments", []); func_comment = "N/A"
|
| 280 |
+
for c_item in comments_data:
|
| 281 |
+
if c_item.get("commentType") == "FUNCTION":
|
| 282 |
+
texts = c_item.get("texts", [])
|
| 283 |
+
if texts: func_comment = texts[0].get("value", "N/A"); break
|
| 284 |
+
|
| 285 |
+
overview_md = (f"## {id_display} ({acc})\n"
|
| 286 |
+
f"[{acc} on UniProt]({uniprot_link})\n\n"
|
| 287 |
+
f"**Protein:** {name}\n**Gene:** {gene_str}\n**Status:** {status_val}\n"
|
| 288 |
+
f"**Organism:** {org_display}\n**Length:** {length} aa\n"
|
| 289 |
+
f"**Existence:** {existence_val}\n**Score:** {score_val}/5\n**Calc. MW:** {mw_str}\n\n"
|
| 290 |
+
f"**Function Snippet:**\n{func_comment}\n\n"
|
| 291 |
+
f"**Sequence (first 100 aa):**\n`{seq_val[:100]}{'...' if len(seq_val) > 100 else ''}`\n\n"
|
| 292 |
+
f"--- \n*More details in other tabs.*")
|
| 293 |
+
|
| 294 |
+
interactions_md = extract_interactions(data)
|
| 295 |
+
pathways_md = extract_pathways(data)
|
| 296 |
+
disease_md = extract_disease_info(data)
|
| 297 |
+
publications_md = extract_publications(data)
|
| 298 |
+
xref_md = extract_cross_references(data)
|
| 299 |
+
|
| 300 |
+
aa_freq, aa_err = get_amino_acid_frequencies(seq_val)
|
| 301 |
+
aa_plot_upd = empty_plot
|
| 302 |
if aa_err: overview_md += f"\n\n**AA Freq Error:** {aa_err}"
|
| 303 |
+
elif aa_freq:
|
| 304 |
+
img_aa = plot_amino_acid_frequencies(aa_freq)
|
| 305 |
if img_aa: aa_plot_upd = gr.update(value=img_aa, visible=True)
|
| 306 |
+
|
| 307 |
+
seq_feat = extract_sequence_features(data)
|
| 308 |
+
feat_plot_upd = empty_plot; feat_msg = ""
|
| 309 |
if seq_feat and length > 0:
|
| 310 |
img_feat = plot_sequence_features(length, seq_feat)
|
| 311 |
if img_feat: feat_plot_upd = gr.update(value=img_feat, visible=True)
|
| 312 |
+
else: feat_msg = "Could not generate sequence feature plot."
|
| 313 |
elif not seq_feat and length > 0 : feat_msg = "No relevant features found for plotting."
|
| 314 |
+
|
| 315 |
+
return (overview_md, aa_plot_upd, feat_plot_upd, feat_msg,
|
| 316 |
+
pathways_md, interactions_md, disease_md, publications_md, xref_md)
|
| 317 |
+
|
| 318 |
except requests.exceptions.HTTPError as e:
|
| 319 |
err_msg_http = f"Error: ID '{uniprot_id}' not found." if e.response.status_code == 404 else f"HTTP error: {e}"
|
| 320 |
return (err_msg_http,) + outputs_on_error[1:]
|
| 321 |
+
except Exception as e:
|
| 322 |
+
return (f"Error: {str(e)[:150]}",) + outputs_on_error[1:]
|
| 323 |
|
|
|
|
| 324 |
# Gradio front end: a UniProt ID input with a submit button, and one tab per
# group of results returned by get_protein_info.
with gr.Blocks(theme=gr.themes.Glass()) as iface:
    gr.Markdown("# Protein Profile Viewer (v1.4 - Cross-references)")
    gr.Markdown("Enter a UniProt ID to explore its details including overview, sequence analysis, functional context, publications, and links to other databases.")

    with gr.Row():
        protein_id_input = gr.Textbox(
            label="Enter UniProt ID",
            placeholder="e.g., P00533",
            scale=3,
        )
        submit_button = gr.Button("Submit", scale=1, variant="primary")

    with gr.Tabs():
        with gr.TabItem("Overview"):
            gr.Markdown("### Protein Overview\nKey information about the protein, including its UniProt ID, name, gene, organism, length, function snippet, and a link to the full UniProt entry. The first 100 amino acids of the sequence are also displayed here.")
            overview_output = gr.Markdown()

        with gr.TabItem("Analysis Plots"):
            gr.Markdown("### Sequence Analysis Visualizations\nGraphical representations of amino acid composition and annotated sequence features.")
            with gr.Column():
                # Both images start hidden; get_protein_info reveals them via gr.update.
                aa_freq_plot_output = gr.Image(
                    label="Amino Acid Frequency Plot",
                    type="pil",
                    show_label=True,
                    visible=False,
                )
                seq_features_plot_output = gr.Image(
                    label="Sequence Features Plot",
                    type="pil",
                    show_label=True,
                    visible=False,
                )
                seq_features_message_output = gr.Markdown()

        with gr.TabItem("Functional Context"):
            gr.Markdown("### Pathways, Interactions & Disease\nBiological context: pathways, interaction partners, and associated diseases.")
            with gr.Accordion("Biological Pathways (KEGG, Reactome)", open=False):
                pathways_output = gr.Markdown()
            with gr.Accordion("Protein Interactions", open=False):
                interactions_output = gr.Markdown()
            with gr.Accordion("Disease Associations", open=False):
                disease_output = gr.Markdown()

        with gr.TabItem("Publications"):
            gr.Markdown("### Relevant Publications\nA list of scientific publications from UniProt.")
            publications_output = gr.Markdown()

        with gr.TabItem("Cross-references"):
            gr.Markdown("### Database Links\nLinks to this protein's entry in other relevant biological databases (e.g., Ensembl, RefSeq, GO, PDB).")
            xref_output = gr.Markdown()

    # Order must match the tuple returned by get_protein_info.
    result_components = [
        overview_output,
        aa_freq_plot_output,
        seq_features_plot_output,
        seq_features_message_output,
        pathways_output,
        interactions_output,
        disease_output,
        publications_output,
        xref_output,
    ]
    submit_button.click(
        fn=get_protein_info,
        inputs=protein_id_input,
        outputs=result_components,
    )

    gr.Examples(
        examples=[["P05067"], ["P00533"], ["Q9BYF1"], ["P0DP23"], ["P04637"]],
        inputs=protein_id_input,
    )

if __name__ == "__main__":
    iface.launch()
|