# Comprehensive protein information fetcher: queries UniProt, InterPro,
# RCSB PDB, and QuickGO for a protein and saves the results to JSON.
# Standard library
import json
import time
import urllib.parse

# Third-party
import requests
def fetch_protein_info(protein_name):
    """Search UniProtKB for *protein_name* and return ``(accession, name)``.

    Preference order: an entry whose gene name exactly matches the query,
    then the first reviewed (Swiss-Prot) entry, then the first result.
    Returns ``(None, None)`` when nothing is found or the request fails.
    """
    url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": protein_name,
        "format": "json",
        "fields": "accession,id,protein_name,gene_names,organism_name,reviewed",
        "size": 10,  # fetch several candidates so we can rank them below
    }
    try:
        # timeout prevents the script from hanging forever on a stalled server
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        if data.get('results'):
            # Try to find an exact match for the gene name first
            for result in data['results']:
                gene_names = result.get('genes', [])
                if gene_names and any(gene.get('geneName', {}).get('value') == protein_name for gene in gene_names):
                    print("Exact gene match found:", result)
                    return process_result(result)
            # If no exact match, return the first reviewed (Swiss-Prot) entry or the first result
            reviewed_result = next((r for r in data['results'] if r.get('entryType') == 'UniProtKB reviewed (Swiss-Prot)'), None)
            if reviewed_result:
                print("Reviewed entry found:", reviewed_result)
                return process_result(reviewed_result)
            else:
                print("Using first result:", data['results'][0])
                return process_result(data['results'][0])
        else:
            print(f"No results found for '{protein_name}'")
            return None, None
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching data: {e}")
        return None, None
def process_result(result):
    """Extract ``(primary_accession, display_name)`` from a UniProt search hit.

    The UniProt REST search response nests the protein name under
    ``proteinDescription.recommendedName`` (a dict), as the sibling
    ``fetch_uniprot_info`` already reads it — not under a ``proteinName``
    list as the previous version assumed, which made the name lookup always
    fall through. Falls back to short names, then the entry ID.
    """
    primary_accession = result.get('primaryAccession')
    recommended = result.get('proteinDescription', {}).get('recommendedName', {})
    name = recommended.get('fullName', {}).get('value')
    if not name:
        # shortNames is a list of {"value": ...} dicts when present
        short_names = recommended.get('shortNames') or []
        if short_names:
            name = short_names[0].get('value')
    if not name:
        # Entry ID key is 'uniProtkbId' in the current REST API; keep the
        # legacy 'id' key as a last resort for backward compatibility.
        name = result.get('uniProtkbId') or result.get('id')
    return primary_accession, name
def fetch_uniprot_info(accession, email):
    """Fetch a full UniProtKB entry for *accession* and flatten it to a dict.

    *email* is embedded in the User-Agent as a courtesy contact for the API.
    Returns None on any request failure.
    """
    uniprot_base_url = "https://rest.uniprot.org/uniprotkb/"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    try:
        # timeout prevents an indefinite hang on a stalled connection
        response = requests.get(f"{uniprot_base_url}{accession}", headers=headers, timeout=30)
        response.raise_for_status()
        uniprot_data = response.json()
        protein_info = {
            "accession": accession,
            "entry_type": uniprot_data.get('entryType'),
            "entry_name": uniprot_data.get('uniProtkbId'),
            "protein_name": uniprot_data.get('proteinDescription', {}).get('recommendedName', {}).get('fullName', {}).get('value'),
            # first gene that actually carries a geneName value
            "gene_name": next((gene.get('geneName', {}).get('value') for gene in uniprot_data.get('genes', []) if gene.get('geneName')), None),
            "organism": uniprot_data.get('organism', {}).get('scientificName'),
            "sequence": uniprot_data.get('sequence', {}).get('value'),
            "sequence_length": uniprot_data.get('sequence', {}).get('length'),
            # first FUNCTION comment's first text paragraph, if any
            "function": next((comment.get('texts', [{}])[0].get('value') for comment in uniprot_data.get('comments', []) if comment.get('commentType') == 'FUNCTION'), None),
            "subcellular_locations": [
                loc.get('location', {}).get('value')
                for comment in uniprot_data.get('comments', [])
                if comment.get('commentType') == 'SUBCELLULAR LOCATION'
                for loc in comment.get('subcellularLocations', [])
            ],
            "ec_numbers": [ec.get('value') for ec in uniprot_data.get('proteinDescription', {}).get('ecNumbers', [])],
            "keywords": [kw.get('name') for kw in uniprot_data.get('keywords', [])],
            "features": [{'type': f.get('type'), 'description': f.get('description')} for f in uniprot_data.get('features', [])]
        }
        return protein_info
    except requests.exceptions.RequestException as e:
        print(f"Error fetching UniProt data: {e}")
        return None
def fetch_comprehensive_interpro_info(accession, email):
    """Fetch InterPro entry/domain locations for a UniProt *accession*.

    Returns the raw JSON payload from the InterPro API, or None on failure.
    """
    base_url = "https://www.ebi.ac.uk/interpro/api/protein/uniprot/"
    protein_url = f"{base_url}{accession}/entry_protein_locations/"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    try:
        # timeout prevents an indefinite hang on a stalled connection
        response = requests.get(protein_url, headers=headers, timeout=30)
        response.raise_for_status()
        interpro_data = response.json()
        return interpro_data
    except requests.exceptions.RequestException as e:
        print(f"Error fetching InterPro data: {e}")
        return None
def fetch_pdb_info(accession, email):
    """Search RCSB PDB for structures of a UniProt *accession*.

    Returns ``{"pdb_entries": [...]}`` with per-structure metadata, or a
    message dict containing an AlphaFold link when no PDB entry exists or
    the request fails.
    """
    pdb_search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    # Search for entries whose polymer entity is cross-referenced to the
    # given UniProt accession.
    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                "operator": "exact_match",
                "value": accession
            }
        },
        "return_type": "entry",
        "request_options": {
            "return_all_hits": True
        }
    }
    try:
        # json= lets requests serialize the body and set Content-Type itself;
        # timeout prevents an indefinite hang.
        response = requests.post(pdb_search_url, headers=headers, json=query, timeout=30)
        response.raise_for_status()
        search_results = response.json()
        pdb_ids = [result['identifier'] for result in search_results.get('result_set', [])]
        if not pdb_ids:
            # No PDB entries found — point the user at the predicted model.
            return {
                "message": "Protein not found in PDB.",
                "alphafold_link": f"https://alphafold.ebi.ac.uk/entry/{accession}"
            }
        pdb_info_list = []
        for pdb_id in pdb_ids:
            time.sleep(0.1)  # Be polite to the API
            pdb_entry_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
            response = requests.get(pdb_entry_url, headers=headers, timeout=30)
            response.raise_for_status()
            pdb_data = response.json()
            pdb_info = {
                "pdb_id": pdb_id,
                "title": pdb_data.get('struct', {}).get('title'),
                "deposition_date": pdb_data.get('rcsb_accession_info', {}).get('deposit_date'),
                "release_date": pdb_data.get('rcsb_accession_info', {}).get('initial_release_date'),
                "experimental_method": pdb_data.get('exptl', [{}])[0].get('method'),
                "resolution": pdb_data.get('rcsb_entry_info', {}).get('resolution_combined', [None])[0],
                "authors": [author.get("name") for author in pdb_data.get("audit_author", [])],
                "ligands": [],
                "pdb_structure_link": f"https://www.rcsb.org/3d-view/{pdb_id}"
            }
            # NOTE(review): the core/entry endpoint may not embed
            # 'nonpolymer_entities' (ligands usually require separate
            # nonpolymer_entity calls) — verify against the Data API; this
            # loop is a harmless no-op when the key is absent.
            ligand_entities = pdb_data.get('nonpolymer_entities', [])
            for ligand in ligand_entities:
                chem_comp = ligand.get('chem_comp', {})
                ligand_info = {
                    "chem_comp_id": chem_comp.get('id'),
                    "name": chem_comp.get('name'),
                    "formula": chem_comp.get('formula'),
                    "weight": chem_comp.get('formula_weight')
                }
                pdb_info['ligands'].append(ligand_info)
            pdb_info_list.append(pdb_info)
        return {"pdb_entries": pdb_info_list}
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDB data: {e}")
        return {
            "message": "Error fetching PDB data.",
            "alphafold_link": f"https://alphafold.ebi.ac.uk/entry/{accession}"
        }
def fetch_protein_go_terms(uniprot_id, email):
    """Fetch up to 10 GO annotations for *uniprot_id* from QuickGO.

    Returns a list of dicts (id/term/aspect/evidence/reference); an empty
    list on failure.
    """
    base_url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    params = {
        "geneProductId": uniprot_id,
        "limit": 10  # Limit to top 10 GO terms
    }
    try:
        # timeout prevents an indefinite hang on a stalled connection
        response = requests.get(base_url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        go_terms = []
        for annotation in data.get('results', []):
            go_terms.append({
                "id": annotation.get('goId'),
                # NOTE(review): 'goName' may be absent unless requested via
                # includeFields=goName — confirm against the QuickGO docs.
                "term": annotation.get('goName'),
                "aspect": annotation.get('goAspect'),
                "evidence": annotation.get('goEvidence'),
                "reference": annotation.get('reference')
            })
        return go_terms
    except requests.exceptions.RequestException as e:
        print(f"Error fetching GO terms for {uniprot_id}: {str(e)}")
        return []
def main():
    """Prompt for a protein name, gather data from all sources, save JSON."""
    email = "your_email@example.com"  # Replace with your actual email
    protein_name = input("What protein would you like to know about? ")
    print(f"\nFetching information for: {protein_name}")
    accession, full_name = fetch_protein_info(protein_name)
    if not accession:
        print(f"No results found for '{protein_name}'")
        return
    print(f"Protein: {full_name}")
    print(f"Accession: {accession}")
    all_data = {
        "uniprot": fetch_uniprot_info(accession, email),
        "interpro": fetch_comprehensive_interpro_info(accession, email),
        "pdb": fetch_pdb_info(accession, email),
        "go_terms": fetch_protein_go_terms(accession, email)
    }
    # Save the data to a JSON file
    filename = f"{accession}_comprehensive_info.json"
    with open(filename, 'w') as f:
        json.dump(all_data, f, indent=2)
    # Bug fix: previously printed a literal placeholder instead of the
    # actual output filename.
    print(f"\nComprehensive information has been saved to {filename}")


if __name__ == "__main__":
    main()