# Protein_Chatbot / fetch_all_proteins.py
# Hosting-page header captured with the file (not part of the program):
#   nakulk02's picture
#   Upload 3 files
#   ef85c96 verified
import requests
import json
import time
import urllib.parse
def fetch_protein_info(protein_name):
    """Resolve a free-text protein or gene name to a UniProtKB search hit.

    Queries the UniProtKB search endpoint (up to 10 hits) and picks one hit,
    in order of preference:

    1. a hit with a gene name equal to ``protein_name`` (case-insensitive),
    2. the first reviewed (Swiss-Prot) hit,
    3. the first hit.

    Args:
        protein_name: Name to search for, e.g. a gene symbol.

    Returns:
        Whatever ``process_result`` returns for the chosen hit, or
        ``(None, None)`` when the name is blank, nothing matched, or the
        request / JSON decoding failed.
    """
    wanted = (protein_name or "").strip()
    if not wanted:
        # A blank query cannot identify a protein; skip the network call.
        print(f"No results found for '{protein_name}'")
        return None, None

    url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": wanted,
        "format": "json",
        "fields": "accession,id,protein_name,gene_names,organism_name,reviewed",
        "size": 10  # Increase size to get more results
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on older requests releases.
        print(f"Error occurred while fetching data: {e}")
        return None, None

    results = data.get('results')
    if not results:
        print(f"No results found for '{protein_name}'")
        return None, None

    # Try to find an exact match for the gene name first. Gene symbols are
    # compared case-insensitively so that "tp53" still matches "TP53".
    wanted_key = wanted.casefold()
    for result in results:
        for gene in result.get('genes') or []:
            symbol = (gene.get('geneName') or {}).get('value')
            if symbol and symbol.casefold() == wanted_key:
                print("Exact gene match found:", result)
                return process_result(result)

    # If no exact match, return the first reviewed (Swiss-Prot) entry or the first result
    reviewed_result = next(
        (r for r in results if r.get('entryType') == 'UniProtKB reviewed (Swiss-Prot)'),
        None,
    )
    if reviewed_result:
        print("Reviewed entry found:", reviewed_result)
        return process_result(reviewed_result)

    print("Using first result:", results[0])
    return process_result(results[0])
def _first_mapping(value):
    """Return ``value`` if it is a dict, its first dict element if it is a list, else ``{}``."""
    if isinstance(value, dict):
        return value
    if isinstance(value, list) and value and isinstance(value[0], dict):
        return value[0]
    return {}


def _name_from_block(block):
    """Return the full name, else the first short name, from a name block (or ``None``)."""
    block = _first_mapping(block)
    full_name = _first_mapping(block.get('fullName')).get('value')
    if full_name:
        return full_name
    for key in ('shortNames', 'shortName'):
        short_name = _first_mapping(block.get(key)).get('value')
        if short_name:
            return short_name
    return None


def process_result(result):
    """Extract the accession and a display name from a UniProtKB search hit.

    The name is taken from the first of these that yields a value:
    ``proteinDescription.recommendedName``, ``proteinDescription.submissionNames``,
    then the ``proteinName`` key this function read previously. Within each,
    the full name is preferred over a short name. If no name is found the
    entry name (``uniProtkbId``, or ``id``) is used instead.

    Args:
        result: One element of the ``results`` list of a UniProtKB search
            response, as a dict.

    Returns:
        A ``(primary_accession, name)`` tuple; either element is ``None``
        when the corresponding data is missing.
    """
    primary_accession = result.get('primaryAccession')
    description = _first_mapping(result.get('proteinDescription'))

    candidate_blocks = (
        description.get('recommendedName'),
        description.get('submissionNames'),
        result.get('proteinName'),
    )
    for block in candidate_blocks:
        name = _name_from_block(block)
        if name:
            return primary_accession, name

    # No descriptive name available: fall back to the entry name.
    return primary_accession, result.get('uniProtkbId') or result.get('id')
def fetch_uniprot_info(accession, email):
    """Fetch one UniProtKB entry and flatten it into a summary dict.

    Args:
        accession: UniProtKB accession appended to the entry URL.
        email: Contact address embedded in the ``User-Agent`` header.

    Returns:
        A dict with the keys ``accession``, ``entry_type``, ``entry_name``,
        ``protein_name``, ``gene_name``, ``organism``, ``sequence``,
        ``sequence_length``, ``function``, ``subcellular_locations``,
        ``ec_numbers``, ``keywords`` and ``features``; or ``None`` when
        ``accession`` is empty or the request / JSON decoding fails.
    """
    if not accession:
        # Without an accession the URL would point at the collection root.
        return None

    uniprot_base_url = "https://rest.uniprot.org/uniprotkb/"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(
            f"{uniprot_base_url}{accession}", headers=headers, timeout=30
        )
        response.raise_for_status()
        uniprot_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on older requests releases.
        print(f"Error fetching UniProt data: {e}")
        return None

    description = uniprot_data.get('proteinDescription') or {}
    recommended = description.get('recommendedName') or {}
    alternatives = description.get('alternativeNames') or []
    submitted = description.get('submissionNames') or []
    comments = uniprot_data.get('comments') or []
    sequence = uniprot_data.get('sequence') or {}

    # Entries without a recommended name carry their name under
    # submissionNames instead, so fall back to the first one that has a value.
    protein_name = (recommended.get('fullName') or {}).get('value')
    if not protein_name:
        protein_name = next(
            (
                (entry.get('fullName') or {}).get('value')
                for entry in submitted
                if (entry.get('fullName') or {}).get('value')
            ),
            None,
        )

    # EC numbers are attached to the individual name blocks; the top-level
    # location read previously is still checked. Order is kept, duplicates dropped.
    ec_numbers = []
    for name_block in (description, recommended, *alternatives, *submitted):
        for ec in name_block.get('ecNumbers') or []:
            value = ec.get('value')
            if value and value not in ec_numbers:
                ec_numbers.append(value)

    # First text of the first FUNCTION comment; `or [{}]` guards an empty list.
    function_text = next(
        (
            (comment.get('texts') or [{}])[0].get('value')
            for comment in comments
            if comment.get('commentType') == 'FUNCTION'
        ),
        None,
    )

    return {
        "accession": accession,
        "entry_type": uniprot_data.get('entryType'),
        "entry_name": uniprot_data.get('uniProtkbId'),
        "protein_name": protein_name,
        "gene_name": next(
            (
                gene.get('geneName', {}).get('value')
                for gene in uniprot_data.get('genes') or []
                if gene.get('geneName')
            ),
            None,
        ),
        "organism": (uniprot_data.get('organism') or {}).get('scientificName'),
        "sequence": sequence.get('value'),
        "sequence_length": sequence.get('length'),
        "function": function_text,
        "subcellular_locations": [
            (loc.get('location') or {}).get('value')
            for comment in comments
            if comment.get('commentType') == 'SUBCELLULAR LOCATION'
            for loc in comment.get('subcellularLocations') or []
        ],
        "ec_numbers": ec_numbers,
        "keywords": [kw.get('name') for kw in uniprot_data.get('keywords') or []],
        "features": [
            {'type': f.get('type'), 'description': f.get('description')}
            for f in uniprot_data.get('features') or []
        ],
    }
def fetch_comprehensive_interpro_info(accession, email):
    """Fetch the InterPro ``entry_protein_locations`` payload for a UniProt accession.

    Args:
        accession: UniProt accession inserted into the InterPro API URL.
        email: Contact address embedded in the ``User-Agent`` header.

    Returns:
        The decoded JSON response, or ``None`` when ``accession`` is empty,
        the response has no body, or the request / JSON decoding fails.
    """
    if not accession:
        return None

    base_url = "https://www.ebi.ac.uk/interpro/api/protein/uniprot/"
    protein_url = f"{base_url}{accession}/entry_protein_locations/"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(protein_url, headers=headers, timeout=30)
        response.raise_for_status()
        # A successful response with no body (e.g. HTTP 204) means "no data";
        # decoding it as JSON would fail, so report it explicitly instead.
        if response.status_code == 204 or not response.content:
            print(f"No InterPro data found for {accession}")
            return None
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on older requests releases.
        print(f"Error fetching InterPro data: {e}")
        return None
def _summarize_pdb_entry(pdb_id, pdb_data):
    """Flatten one RCSB core-entry payload into the summary dict for ``pdb_id``."""
    accession_info = pdb_data.get('rcsb_accession_info') or {}
    # `or [...]` guards against keys that are present but empty or null,
    # which would otherwise make the `[0]` lookups raise.
    methods = pdb_data.get('exptl') or [{}]
    resolutions = (pdb_data.get('rcsb_entry_info') or {}).get('resolution_combined') or [None]

    # NOTE(review): ligands are read from an embedded `nonpolymer_entities`
    # list; if this always comes back empty, confirm against the RCSB Data API
    # whether the core entry payload actually includes that key.
    ligands = []
    for ligand in pdb_data.get('nonpolymer_entities') or []:
        chem_comp = ligand.get('chem_comp') or {}
        ligands.append({
            "chem_comp_id": chem_comp.get('id'),
            "name": chem_comp.get('name'),
            "formula": chem_comp.get('formula'),
            "weight": chem_comp.get('formula_weight')
        })

    return {
        "pdb_id": pdb_id,
        "title": (pdb_data.get('struct') or {}).get('title'),
        "deposition_date": accession_info.get('deposit_date'),
        "release_date": accession_info.get('initial_release_date'),
        "experimental_method": methods[0].get('method'),
        "resolution": resolutions[0],
        "authors": [author.get("name") for author in pdb_data.get("audit_author") or []],
        "ligands": ligands,
        "pdb_structure_link": f"https://www.rcsb.org/3d-view/{pdb_id}"
    }


def fetch_pdb_info(accession, email):
    """Find PDB entries referencing a UniProt accession and summarise each one.

    Runs an exact-match search on the polymer entity reference-sequence
    accession, then fetches every matching entry from the RCSB core entry
    endpoint (pausing 0.1 s before each request).

    Args:
        accession: UniProt accession to search for.
        email: Contact address embedded in the ``User-Agent`` header.

    Returns:
        ``{"pdb_entries": [...]}`` with one summary dict per entry when
        matches exist. Otherwise a dict with ``message`` and
        ``alphafold_link`` keys: "Protein not found in PDB." when the search
        has no hits, "Error fetching PDB data." when any request or JSON
        decoding fails.
    """
    pdb_search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    # Construct the search query
    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                "operator": "exact_match",
                "value": accession
            }
        },
        "return_type": "entry",
        "request_options": {
            "return_all_hits": True
        }
    }
    try:
        # Perform the search. Without a timeout a stalled connection would
        # block forever.
        response = requests.post(
            pdb_search_url, headers=headers, data=json.dumps(query), timeout=30
        )
        response.raise_for_status()
        # A successful response with no body (HTTP 204) means "no hits";
        # decoding it as JSON would be misreported as a fetch error.
        if response.status_code == 204 or not response.content:
            search_results = {}
        else:
            search_results = response.json()

        pdb_ids = [hit['identifier'] for hit in search_results.get('result_set') or []]
        if not pdb_ids:
            # No PDB entries found
            return {
                "message": "Protein not found in PDB.",
                "alphafold_link": f"https://alphafold.ebi.ac.uk/entry/{accession}"
            }

        pdb_info_list = []
        for pdb_id in pdb_ids:
            time.sleep(0.1)  # Be polite to the API
            pdb_entry_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
            response = requests.get(pdb_entry_url, headers=headers, timeout=30)
            response.raise_for_status()
            pdb_info_list.append(_summarize_pdb_entry(pdb_id, response.json()))
        return {"pdb_entries": pdb_info_list}
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on older requests releases.
        print(f"Error fetching PDB data: {e}")
        return {
            "message": "Error fetching PDB data.",
            "alphafold_link": f"https://alphafold.ebi.ac.uk/entry/{accession}"
        }
def fetch_protein_go_terms(uniprot_id, email, limit=10):
    """Fetch GO annotations for a gene product from the QuickGO annotation search.

    Args:
        uniprot_id: Identifier sent as the ``geneProductId`` query parameter.
        email: Contact address embedded in the ``User-Agent`` header.
        limit: Maximum number of annotations requested (defaults to 10, the
            value that was previously hard-coded).

    Returns:
        A list of dicts with the keys ``id``, ``term``, ``aspect``,
        ``evidence`` and ``reference``, one per returned annotation. The list
        is empty when ``uniprot_id`` is empty or the request / JSON decoding
        fails.
    """
    if not uniprot_id:
        return []

    base_url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    params = {
        "geneProductId": uniprot_id,
        # goName is an optional field that the service only fills in when it
        # is requested explicitly; without this, "term" would always be None.
        "includeFields": "goName",
        "limit": limit
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(base_url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on older requests releases.
        print(f"Error fetching GO terms for {uniprot_id}: {str(e)}")
        return []

    return [
        {
            "id": annotation.get('goId'),
            "term": annotation.get('goName'),
            "aspect": annotation.get('goAspect'),
            "evidence": annotation.get('goEvidence'),
            "reference": annotation.get('reference')
        }
        for annotation in data.get('results') or []
    ]
def main():
    """Prompt for a protein name, gather data from every source, and save it as JSON.

    The name is resolved to an accession with ``fetch_protein_info``; the
    UniProt, InterPro, PDB and GO lookups are then run for that accession and
    their results written to ``<accession>_comprehensive_info.json`` in the
    current working directory.
    """
    email = "your_email@example.com"  # Replace with your actual email
    protein_name = input("What protein would you like to know about? ").strip()
    if not protein_name:
        # Nothing to search for; avoid sending an empty query.
        print("No protein name entered.")
        return

    print(f"\nFetching information for: {protein_name}")
    accession, full_name = fetch_protein_info(protein_name)
    if not accession:
        print(f"No results found for '{protein_name}'")
        return

    print(f"Protein: {full_name}")
    print(f"Accession: {accession}")
    all_data = {
        "uniprot": fetch_uniprot_info(accession, email),
        "interpro": fetch_comprehensive_interpro_info(accession, email),
        "pdb": fetch_pdb_info(accession, email),
        "go_terms": fetch_protein_go_terms(accession, email)
    }

    # Save the data to a JSON file. The encoding is pinned to UTF-8 so the
    # output does not depend on the platform default, which lets non-ASCII
    # text be written as-is rather than as \u escapes.
    filename = f"{accession}_comprehensive_info.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)
    print(f"\nComprehensive information has been saved to {filename}")
# Run the interactive lookup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()