Spaces:

ProteinSage
/

Protein_Chatbot

Sleeping

App Files Files Community

Protein_Chatbot / fetch_all_proteins.py

nakulk02

Upload 3 files

ef85c96 verified over 1 year ago

raw

history blame contribute delete

10 kB

	import requests
	import json
	import time
	import urllib.parse


	def fetch_protein_info(protein_name, timeout=30):
	    """Resolve a protein or gene name to a UniProtKB accession and name.

	    Queries the UniProtKB search endpoint for up to 10 hits and picks one
	    in this order of preference:

	    1. the first hit having a gene whose primary name equals
	       ``protein_name`` exactly (case-sensitive),
	    2. the first reviewed (Swiss-Prot) hit,
	    3. the first hit.

	    Args:
	        protein_name: Free-text query sent to UniProt.
	        timeout: Seconds to wait for the HTTP response before giving up.

	    Returns:
	        Whatever ``process_result`` returns for the chosen hit, or
	        ``(None, None)`` when the request fails, the body is not JSON, or
	        the search has no hits.
	    """
	    url = "https://rest.uniprot.org/uniprotkb/search"
	    params = {
	        "query": protein_name,
	        "format": "json",
	        "fields": "accession,id,protein_name,gene_names,organism_name,reviewed",
	        "size": 10,  # several hits, so the preference rules have a choice
	    }
	    try:
	        # Without a timeout, requests waits indefinitely on a stalled server.
	        response = requests.get(url, params=params, timeout=timeout)
	        response.raise_for_status()
	        data = response.json()
	    except (requests.exceptions.RequestException, ValueError) as e:
	        # ValueError covers a non-JSON body on requests versions whose JSON
	        # decode error is not a RequestException subclass.
	        print(f"Error occurred while fetching data: {e}")
	        return None, None

	    results = data.get('results') or []
	    if not results:
	        print(f"No results found for '{protein_name}'")
	        return None, None

	    def has_gene_named(entry):
	        """Return True if any gene of *entry* is named exactly protein_name."""
	        return any(
	            gene.get('geneName', {}).get('value') == protein_name
	            for gene in entry.get('genes') or []
	        )

	    # Try to find an exact match for the gene name first.
	    exact_match = next((r for r in results if has_gene_named(r)), None)
	    if exact_match is not None:
	        print("Exact gene match found:", exact_match)
	        return process_result(exact_match)

	    # Otherwise prefer the first reviewed (Swiss-Prot) entry.
	    reviewed_result = next(
	        (r for r in results
	         if r.get('entryType') == 'UniProtKB reviewed (Swiss-Prot)'),
	        None,
	    )
	    if reviewed_result:
	        print("Reviewed entry found:", reviewed_result)
	        return process_result(reviewed_result)

	    print("Using first result:", results[0])
	    return process_result(results[0])


	def process_result(result):
	    """Extract the accession and a display name from one UniProt search hit.

	    The name is taken from the first of these that yields a value:
	    ``proteinDescription.recommendedName.fullName``, the first
	    ``proteinDescription.submissionNames`` full name, the first
	    ``recommendedName.shortNames`` value, the legacy ``proteinName`` list
	    shape this function originally read, and finally the entry identifier
	    (``uniProtkbId``, then ``id``).

	    Args:
	        result: One element of the ``results`` list of a UniProtKB search
	            response, as a dict.

	    Returns:
	        A ``(primary_accession, name)`` tuple; either element is ``None``
	        when the corresponding data is missing.
	    """
	    primary_accession = result.get('primaryAccession')

	    description = result.get('proteinDescription') or {}
	    recommended = description.get('recommendedName') or {}

	    def first_value(candidates):
	        """Return the first truthy item of *candidates*, or None."""
	        return next((value for value in candidates if value), None)

	    name = (recommended.get('fullName') or {}).get('value')
	    if not name:
	        # Entries without a recommended name carry submitted names instead.
	        name = first_value(
	            (submitted.get('fullName') or {}).get('value')
	            for submitted in description.get('submissionNames') or []
	        )
	    if not name:
	        name = first_value(
	            short.get('value') for short in recommended.get('shortNames') or []
	        )
	    if not name:
	        # Legacy shape kept for backward compatibility; `or` also guards the
	        # empty-list case, where indexing [0] used to raise IndexError.
	        legacy = (result.get('proteinName') or [{}])[0] or {}
	        name = (legacy.get('fullName') or {}).get('value')
	        if not name:
	            name = first_value(
	                short.get('value') for short in legacy.get('shortName') or []
	            )
	    if not name:
	        name = result.get('uniProtkbId') or result.get('id')
	    return primary_accession, name

	def fetch_uniprot_info(accession, email, timeout=30):
	    """Fetch one UniProtKB entry and flatten the fields of interest.

	    Args:
	        accession: UniProtKB accession appended to the entry endpoint URL.
	        email: Contact address embedded in the User-Agent header.
	        timeout: Seconds to wait for the HTTP response before giving up.

	    Returns:
	        A dict with the keys ``accession``, ``entry_type``, ``entry_name``,
	        ``protein_name``, ``gene_name``, ``organism``, ``sequence``,
	        ``sequence_length``, ``function``, ``subcellular_locations``,
	        ``ec_numbers``, ``keywords`` and ``features``; or ``None`` when the
	        request fails or the body is not JSON.
	    """
	    uniprot_base_url = "https://rest.uniprot.org/uniprotkb/"
	    headers = {
	        "Accept": "application/json",
	        "User-Agent": f"Python script (mailto:{email})",
	    }
	    try:
	        # Without a timeout, requests waits indefinitely on a stalled server.
	        response = requests.get(
	            f"{uniprot_base_url}{accession}", headers=headers, timeout=timeout
	        )
	        response.raise_for_status()
	        uniprot_data = response.json()
	    except (requests.exceptions.RequestException, ValueError) as e:
	        print(f"Error fetching UniProt data: {e}")
	        return None

	    description = uniprot_data.get('proteinDescription') or {}
	    recommended = description.get('recommendedName') or {}
	    sequence = uniprot_data.get('sequence') or {}
	    comments = uniprot_data.get('comments') or []

	    protein_name = (recommended.get('fullName') or {}).get('value')
	    if not protein_name:
	        # Entries without a recommended name carry submitted names instead.
	        protein_name = next(
	            (
	                (submitted.get('fullName') or {}).get('value')
	                for submitted in description.get('submissionNames') or []
	                if (submitted.get('fullName') or {}).get('value')
	            ),
	            None,
	        )

	    # Skip FUNCTION comments with an empty 'texts' list rather than letting
	    # the [0] lookup raise IndexError.
	    function_text = next(
	        (
	            comment['texts'][0].get('value')
	            for comment in comments
	            if comment.get('commentType') == 'FUNCTION' and comment.get('texts')
	        ),
	        None,
	    )

	    # EC numbers are nested under recommendedName in the entry JSON; the
	    # top-level lookup is retained only as a fallback.
	    ec_entries = recommended.get('ecNumbers') or description.get('ecNumbers') or []

	    return {
	        "accession": accession,
	        "entry_type": uniprot_data.get('entryType'),
	        "entry_name": uniprot_data.get('uniProtkbId'),
	        "protein_name": protein_name,
	        "gene_name": next(
	            (
	                gene.get('geneName', {}).get('value')
	                for gene in uniprot_data.get('genes') or []
	                if gene.get('geneName')
	            ),
	            None,
	        ),
	        "organism": (uniprot_data.get('organism') or {}).get('scientificName'),
	        "sequence": sequence.get('value'),
	        "sequence_length": sequence.get('length'),
	        "function": function_text,
	        "subcellular_locations": [
	            loc.get('location', {}).get('value')
	            for comment in comments
	            if comment.get('commentType') == 'SUBCELLULAR LOCATION'
	            for loc in comment.get('subcellularLocations') or []
	        ],
	        "ec_numbers": [ec.get('value') for ec in ec_entries],
	        "keywords": [kw.get('name') for kw in uniprot_data.get('keywords') or []],
	        "features": [
	            {'type': feature.get('type'), 'description': feature.get('description')}
	            for feature in uniprot_data.get('features') or []
	        ],
	    }

	def fetch_comprehensive_interpro_info(accession, email, timeout=30):
	    """Fetch the InterPro record for a UniProt accession.

	    Args:
	        accession: UniProt accession inserted into the InterPro API URL.
	        email: Contact address embedded in the User-Agent header.
	        timeout: Seconds to wait for the HTTP response before giving up.

	    Returns:
	        The decoded JSON response unchanged, or ``None`` when the request
	        fails, the server answers with an empty body (HTTP 204), or the body
	        is not valid JSON.
	    """
	    base_url = "https://www.ebi.ac.uk/interpro/api/protein/uniprot/"
	    protein_url = f"{base_url}{accession}/entry_protein_locations/"
	    headers = {
	        "Accept": "application/json",
	        "User-Agent": f"Python script (mailto:{email})",
	    }
	    try:
	        # Without a timeout, requests waits indefinitely on a stalled server.
	        response = requests.get(protein_url, headers=headers, timeout=timeout)
	        response.raise_for_status()
	        # A successful response with no content has nothing to decode;
	        # calling .json() on it would fail.
	        if response.status_code == 204 or not response.content:
	            print(f"No InterPro data found for {accession}")
	            return None
	        return response.json()
	    except (requests.exceptions.RequestException, ValueError) as e:
	        print(f"Error fetching InterPro data: {e}")
	        return None

	def _summarize_pdb_entry(pdb_id, pdb_data):
	    """Pick the reported fields out of one RCSB core-entry JSON document."""
	    accession_info = pdb_data.get('rcsb_accession_info') or {}
	    entry_info = pdb_data.get('rcsb_entry_info') or {}
	    # The `or` fallbacks also cover keys that are present but empty or null,
	    # where indexing [0] on the raw value would raise.
	    methods = pdb_data.get('exptl') or [{}]
	    resolutions = entry_info.get('resolution_combined') or [None]

	    # NOTE(review): ligands are read from a 'nonpolymer_entities' key; confirm
	    # the core-entry endpoint actually returns it, otherwise this list stays
	    # empty for every entry.
	    ligands = []
	    for ligand in pdb_data.get('nonpolymer_entities') or []:
	        chem_comp = ligand.get('chem_comp') or {}
	        ligands.append({
	            "chem_comp_id": chem_comp.get('id'),
	            "name": chem_comp.get('name'),
	            "formula": chem_comp.get('formula'),
	            "weight": chem_comp.get('formula_weight'),
	        })

	    return {
	        "pdb_id": pdb_id,
	        "title": (pdb_data.get('struct') or {}).get('title'),
	        "deposition_date": accession_info.get('deposit_date'),
	        "release_date": accession_info.get('initial_release_date'),
	        "experimental_method": methods[0].get('method'),
	        "resolution": resolutions[0],
	        "authors": [
	            author.get("name") for author in pdb_data.get("audit_author") or []
	        ],
	        "ligands": ligands,
	        "pdb_structure_link": f"https://www.rcsb.org/3d-view/{pdb_id}",
	    }


	def fetch_pdb_info(accession, email, timeout=30):
	    """Find PDB entries referencing a UniProt accession and summarize them.

	    Runs an exact-match search on the reference-sequence accession attribute
	    of the RCSB search service, then fetches each matching entry from the
	    RCSB data service.

	    Args:
	        accession: UniProt accession to search for.
	        email: Contact address embedded in the User-Agent header.
	        timeout: Seconds to wait for each HTTP response before giving up.

	    Returns:
	        ``{"pdb_entries": [...]}`` with one summary dict per entry when hits
	        exist. Otherwise a dict with a ``message`` ("Protein not found in
	        PDB." or "Error fetching PDB data.") and an ``alphafold_link`` for
	        the accession.
	    """
	    pdb_search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
	    headers = {
	        "Content-Type": "application/json",
	        "Accept": "application/json",
	        "User-Agent": f"Python script (mailto:{email})",
	    }
	    alphafold_link = f"https://alphafold.ebi.ac.uk/entry/{accession}"

	    # Construct the search query
	    query = {
	        "query": {
	            "type": "terminal",
	            "service": "text",
	            "parameters": {
	                "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
	                "operator": "exact_match",
	                "value": accession,
	            },
	        },
	        "return_type": "entry",
	        "request_options": {
	            "return_all_hits": True,
	        },
	    }

	    try:
	        # Perform the search
	        response = requests.post(
	            pdb_search_url, headers=headers, data=json.dumps(query), timeout=timeout
	        )
	        response.raise_for_status()
	        # An empty result set is answered with an empty body (HTTP 204);
	        # decoding it would fail and be misreported as a fetch error.
	        if response.status_code == 204 or not response.content:
	            pdb_ids = []
	        else:
	            search_results = response.json()
	            pdb_ids = [
	                hit['identifier'] for hit in search_results.get('result_set') or []
	            ]

	        if not pdb_ids:
	            # No PDB entries found
	            return {
	                "message": "Protein not found in PDB.",
	                "alphafold_link": alphafold_link,
	            }

	        pdb_info_list = []
	        for pdb_id in pdb_ids:
	            time.sleep(0.1)  # Be polite to the API
	            pdb_entry_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
	            response = requests.get(pdb_entry_url, headers=headers, timeout=timeout)
	            response.raise_for_status()
	            pdb_info_list.append(_summarize_pdb_entry(pdb_id, response.json()))
	        return {"pdb_entries": pdb_info_list}
	    except (requests.exceptions.RequestException, ValueError) as e:
	        print(f"Error fetching PDB data: {e}")
	        return {
	            "message": "Error fetching PDB data.",
	            "alphafold_link": alphafold_link,
	        }

	def fetch_protein_go_terms(uniprot_id, email, timeout=30):
	    """Fetch up to 10 GO annotations for a gene product from QuickGO.

	    Args:
	        uniprot_id: Identifier sent as the ``geneProductId`` query parameter.
	        email: Contact address embedded in the User-Agent header.
	        timeout: Seconds to wait for the HTTP response before giving up.

	    Returns:
	        A list of dicts with the keys ``id``, ``term``, ``aspect``,
	        ``evidence`` and ``reference``, one per annotation returned; an
	        empty list when the request fails or the body is not JSON.
	    """
	    base_url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
	    headers = {
	        "Accept": "application/json",
	        "User-Agent": f"Python script (mailto:{email})",
	    }
	    params = {
	        "geneProductId": uniprot_id,
	        # goName is an optional field that QuickGO fills in only on request;
	        # without this the "term" value below comes back empty.
	        "includeFields": "goName",
	        "limit": 10,  # only the first 10 annotations are requested
	    }
	    try:
	        # Without a timeout, requests waits indefinitely on a stalled server.
	        response = requests.get(
	            base_url, params=params, headers=headers, timeout=timeout
	        )
	        response.raise_for_status()
	        data = response.json()
	    except (requests.exceptions.RequestException, ValueError) as e:
	        print(f"Error fetching GO terms for {uniprot_id}: {e}")
	        return []

	    return [
	        {
	            "id": annotation.get('goId'),
	            "term": annotation.get('goName'),
	            "aspect": annotation.get('goAspect'),
	            "evidence": annotation.get('goEvidence'),
	            "reference": annotation.get('reference'),
	        }
	        for annotation in data.get('results') or []
	    ]


	def main():
	    """Prompt for a protein name, gather its data and save it as JSON.

	    Resolves the name with ``fetch_protein_info``, then collects the
	    UniProt, InterPro, PDB and GO results into one dict and writes it to
	    ``<accession>_comprehensive_info.json`` in the current directory.
	    Returns early, writing nothing, when the input is blank or no
	    accession could be resolved.
	    """
	    email = "your_email@example.com"  # Replace with your actual email
	    # Strip surrounding whitespace so a stray space neither alters the query
	    # nor turns a blank answer into a request.
	    protein_name = input("What protein would you like to know about? ").strip()
	    if not protein_name:
	        print("No protein name entered.")
	        return

	    print(f"\nFetching information for: {protein_name}")
	    accession, full_name = fetch_protein_info(protein_name)

	    if not accession:
	        # fetch_protein_info has already printed the specific reason (no hits
	        # or a request error), so stay neutral here instead of claiming
	        # "no results" after a network failure.
	        print(f"Could not retrieve a UniProt entry for '{protein_name}'")
	        return

	    print(f"Protein: {full_name}")
	    print(f"Accession: {accession}")

	    all_data = {
	        "uniprot": fetch_uniprot_info(accession, email),
	        "interpro": fetch_comprehensive_interpro_info(accession, email),
	        "pdb": fetch_pdb_info(accession, email),
	        "go_terms": fetch_protein_go_terms(accession, email),
	    }

	    # Save the data to a JSON file; the explicit encoding keeps the output
	    # independent of the platform's default locale.
	    filename = f"{accession}_comprehensive_info.json"
	    with open(filename, 'w', encoding='utf-8') as f:
	        json.dump(all_data, f, indent=2)

	    print(f"\nComprehensive information has been saved to {filename}")

	# Run the interactive lookup only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()