"""Collect protein annotations from UniProt, InterPro, RCSB PDB and QuickGO.

Run as a script: prompts for a protein or gene name, resolves it to a UniProt
accession, queries each service and writes the combined result to
``<accession>_comprehensive_info.json`` in the current directory.
"""

import json
import time
import urllib.parse

import requests

# Seconds to wait on any single HTTP call. ``requests`` has no default
# timeout, so without this an unresponsive server blocks the script forever.
REQUEST_TIMEOUT = 30


def _first(items, default=None):
    """Return the first element of a non-empty list/tuple, else *default*."""
    if isinstance(items, (list, tuple)) and items:
        return items[0]
    return default


def _user_agent_headers(email, **extra):
    """Build the JSON request headers shared by the fetchers."""
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})",
    }
    headers.update(extra)
    return headers


def fetch_protein_info(protein_name):
    """Search UniProtKB for *protein_name* and pick the best matching entry.

    Preference order among the returned results: an entry whose gene name
    equals *protein_name* exactly, then the first reviewed (Swiss-Prot)
    entry, then the first result.

    Returns:
        ``(accession, name)`` as produced by :func:`process_result`, or
        ``(None, None)`` when nothing was found or the request failed
        (a message is printed in both cases).
    """
    url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": protein_name,
        "format": "json",
        "fields": "accession,id,protein_name,gene_names,organism_name,reviewed",
        "size": 10,  # ask for several hits so the preference order can apply
    }

    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an undecodable JSON body on any requests version.
        print(f"Error occurred while fetching data: {e}")
        return None, None

    results = data.get('results')
    if not results:
        print(f"No results found for '{protein_name}'")
        return None, None

    # 1. Exact gene-name match.
    for result in results:
        genes = result.get('genes') or []
        if any((gene.get('geneName') or {}).get('value') == protein_name
               for gene in genes):
            print("Exact gene match found:", result)
            return process_result(result)

    # 2. First reviewed (Swiss-Prot) entry.
    reviewed_result = next(
        (r for r in results
         if r.get('entryType') == 'UniProtKB reviewed (Swiss-Prot)'),
        None,
    )
    if reviewed_result:
        print("Reviewed entry found:", reviewed_result)
        return process_result(reviewed_result)

    # 3. Whatever came first.
    print("Using first result:", results[0])
    return process_result(results[0])


def process_result(result):
    """Extract ``(primary_accession, display_name)`` from a search result.

    The name is looked up where ``fetch_uniprot_info`` reads it
    (``proteinDescription.recommendedName.fullName.value``), then under the
    older ``proteinName`` layout this function originally expected, and
    finally falls back to the entry identifier (``uniProtkbId`` or ``id``).
    Missing or empty fields never raise; the name is ``None`` only when no
    candidate is present.
    """
    primary_accession = result.get('primaryAccession')

    description = result.get('proteinDescription') or {}
    recommended = description.get('recommendedName') or {}
    name = (recommended.get('fullName') or {}).get('value')

    if not name:
        # NOTE(review): unreviewed entries appear to carry their name under
        # ``submissionNames`` instead; harmless when the key is absent.
        submitted = _first(description.get('submissionNames')) or {}
        name = (submitted.get('fullName') or {}).get('value')

    if not name:
        # Older layout kept for backward compatibility.
        legacy = _first(result.get('proteinName')) or {}
        name = (legacy.get('fullName') or {}).get('value')
        if not name:
            name = (_first(legacy.get('shortName')) or {}).get('value')

    if not name:
        name = result.get('uniProtkbId') or result.get('id')

    return primary_accession, name


def fetch_uniprot_info(accession, email):
    """Fetch one UniProtKB entry and flatten the fields of interest.

    Args:
        accession: UniProt accession to look up.
        email: contact address placed in the ``User-Agent`` header.

    Returns:
        A dict with keys ``accession``, ``entry_type``, ``entry_name``,
        ``protein_name``, ``gene_name``, ``organism``, ``sequence``,
        ``sequence_length``, ``function``, ``subcellular_locations``,
        ``ec_numbers``, ``keywords`` and ``features``; or ``None`` if the
        request failed (a message is printed).
    """
    uniprot_base_url = "https://rest.uniprot.org/uniprotkb/"
    headers = _user_agent_headers(email)

    try:
        response = requests.get(
            f"{uniprot_base_url}{urllib.parse.quote(str(accession), safe='')}",
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        uniprot_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching UniProt data: {e}")
        return None

    description = uniprot_data.get('proteinDescription') or {}
    recommended = description.get('recommendedName') or {}
    comments = uniprot_data.get('comments') or []
    sequence = uniprot_data.get('sequence') or {}

    # NOTE(review): EC numbers seem to be nested under ``recommendedName`` in
    # current payloads; both locations are read so neither layout is missed.
    ec_entries = (description.get('ecNumbers') or []) + \
        (recommended.get('ecNumbers') or [])

    return {
        "accession": accession,
        "entry_type": uniprot_data.get('entryType'),
        "entry_name": uniprot_data.get('uniProtkbId'),
        "protein_name": (recommended.get('fullName') or {}).get('value'),
        "gene_name": next(
            (gene.get('geneName', {}).get('value')
             for gene in uniprot_data.get('genes') or []
             if gene.get('geneName')),
            None,
        ),
        "organism": (uniprot_data.get('organism') or {}).get('scientificName'),
        "sequence": sequence.get('value'),
        "sequence_length": sequence.get('length'),
        # First text of the first FUNCTION comment; tolerant of empty lists.
        "function": next(
            ((_first(comment.get('texts')) or {}).get('value')
             for comment in comments
             if comment.get('commentType') == 'FUNCTION'),
            None,
        ),
        "subcellular_locations": [
            (loc.get('location') or {}).get('value')
            for comment in comments
            if comment.get('commentType') == 'SUBCELLULAR LOCATION'
            for loc in comment.get('subcellularLocations') or []
        ],
        # dict.fromkeys de-duplicates while keeping first-seen order.
        "ec_numbers": list(dict.fromkeys(ec.get('value') for ec in ec_entries)),
        "keywords": [kw.get('name') for kw in uniprot_data.get('keywords') or []],
        "features": [
            {'type': f.get('type'), 'description': f.get('description')}
            for f in uniprot_data.get('features') or []
        ],
    }


def fetch_comprehensive_interpro_info(accession, email):
    """Fetch the InterPro ``entry_protein_locations`` payload for *accession*.

    Returns the decoded JSON unchanged, or ``None`` when the request failed
    or the service replied with an empty body (a message is printed).
    """
    base_url = "https://www.ebi.ac.uk/interpro/api/protein/uniprot/"
    protein_url = (
        f"{base_url}{urllib.parse.quote(str(accession), safe='')}"
        "/entry_protein_locations/"
    )
    headers = _user_agent_headers(email)

    try:
        response = requests.get(protein_url, headers=headers,
                                timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        if response.status_code == 204 or not response.content:
            # An empty body cannot be decoded; report it as "no data"
            # instead of a JSON error.
            print(f"No InterPro data found for {accession}")
            return None
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching InterPro data: {e}")
        return None


def _pdb_fallback(message, accession):
    """Build the dict returned when no PDB entries can be reported."""
    return {
        "message": message,
        "alphafold_link": f"https://alphafold.ebi.ac.uk/entry/{accession}",
    }


def fetch_pdb_info(accession, email):
    """Find PDB entries that reference *accession* and summarise each one.

    Returns:
        ``{"pdb_entries": [...]}`` with one summary dict per entry, or a
        dict with ``message`` and ``alphafold_link`` when the search has no
        hits or any request fails.
    """
    pdb_search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    headers = _user_agent_headers(email, **{"Content-Type": "application/json"})

    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                "operator": "exact_match",
                "value": accession,
            },
        },
        "return_type": "entry",
        "request_options": {"return_all_hits": True},
    }

    try:
        response = requests.post(pdb_search_url, headers=headers,
                                 data=json.dumps(query),
                                 timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        # The search service answers "no hits" with HTTP 204 and an empty
        # body, which must not be passed to ``.json()``.
        if response.status_code == 204 or not response.content:
            return _pdb_fallback("Protein not found in PDB.", accession)

        search_results = response.json()
        pdb_ids = [
            hit['identifier']
            for hit in search_results.get('result_set') or []
            if hit.get('identifier')
        ]
        if not pdb_ids:
            return _pdb_fallback("Protein not found in PDB.", accession)

        pdb_info_list = []
        for pdb_id in pdb_ids:
            time.sleep(0.1)  # Be polite to the API
            pdb_entry_url = (
                "https://data.rcsb.org/rest/v1/core/entry/"
                f"{urllib.parse.quote(str(pdb_id), safe='')}"
            )
            response = requests.get(pdb_entry_url, headers=headers,
                                    timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            pdb_data = response.json()

            accession_info = pdb_data.get('rcsb_accession_info') or {}
            entry_info = pdb_data.get('rcsb_entry_info') or {}

            # NOTE(review): confirm that this endpoint actually returns
            # ``nonpolymer_entities``; if it does not, ``ligands`` is always
            # empty and a separate lookup is needed.
            ligands = []
            for ligand in pdb_data.get('nonpolymer_entities') or []:
                chem_comp = ligand.get('chem_comp') or {}
                ligands.append({
                    "chem_comp_id": chem_comp.get('id'),
                    "name": chem_comp.get('name'),
                    "formula": chem_comp.get('formula'),
                    "weight": chem_comp.get('formula_weight'),
                })

            pdb_info_list.append({
                "pdb_id": pdb_id,
                "title": (pdb_data.get('struct') or {}).get('title'),
                "deposition_date": accession_info.get('deposit_date'),
                "release_date": accession_info.get('initial_release_date'),
                # _first() tolerates a missing, null or empty list.
                "experimental_method": (_first(pdb_data.get('exptl')) or {}).get('method'),
                "resolution": _first(entry_info.get('resolution_combined')),
                "authors": [
                    author.get("name")
                    for author in pdb_data.get("audit_author") or []
                ],
                "ligands": ligands,
                "pdb_structure_link": f"https://www.rcsb.org/3d-view/{pdb_id}",
            })

        return {"pdb_entries": pdb_info_list}

    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching PDB data: {e}")
        return _pdb_fallback("Error fetching PDB data.", accession)


def fetch_protein_go_terms(uniprot_id, email):
    """Fetch up to 10 GO annotations for *uniprot_id* from QuickGO.

    Returns a list of dicts with keys ``id``, ``term``, ``aspect``,
    ``evidence`` and ``reference``; an empty list if the request failed
    (a message is printed).
    """
    base_url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
    headers = _user_agent_headers(email)
    params = {
        "geneProductId": uniprot_id,
        "limit": 10,  # first 10 annotations returned, not a ranking
        # The term name is an optional field; without this request
        # parameter ``goName`` comes back empty.
        "includeFields": "goName",
    }

    try:
        response = requests.get(base_url, params=params, headers=headers,
                                timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching GO terms for {uniprot_id}: {str(e)}")
        return []

    return [
        {
            "id": annotation.get('goId'),
            "term": annotation.get('goName'),
            "aspect": annotation.get('goAspect'),
            "evidence": annotation.get('goEvidence'),
            "reference": annotation.get('reference'),
        }
        for annotation in data.get('results') or []
    ]


def main():
    """Prompt for a protein name, gather all annotations and save them as JSON."""
    email = "your_email@example.com"  # Replace with your actual email

    protein_name = input("What protein would you like to know about? ").strip()
    if not protein_name:
        # A blank query would only produce a confusing service error.
        print("No protein name entered.")
        return

    print(f"\nFetching information for: {protein_name}")

    accession, full_name = fetch_protein_info(protein_name)
    if not accession:
        # fetch_protein_info has already printed why (no hits vs. network
        # error); repeating "No results found" here would be misleading.
        return

    print(f"Protein: {full_name}")
    print(f"Accession: {accession}")

    all_data = {
        "uniprot": fetch_uniprot_info(accession, email),
        "interpro": fetch_comprehensive_interpro_info(accession, email),
        "pdb": fetch_pdb_info(accession, email),
        "go_terms": fetch_protein_go_terms(accession, email),
    }

    # Save the data to a JSON file
    filename = f"{accession}_comprehensive_info.json"
    with open(filename, 'w', encoding="utf-8") as f:
        json.dump(all_data, f, indent=2)

    print(f"\nComprehensive information has been saved to {filename}")


if __name__ == "__main__":
    main()