# Spaces: ProteinSage / Protein_Chatbot (Sleeping)
# File size: 9,998 Bytes
# ef85c96

import requests
import json
import time
import urllib.parse


def fetch_protein_info(protein_name, timeout=30):
    """Search UniProtKB for ``protein_name`` and pick the best-matching hit.

    Up to 10 hits are requested. The entry is chosen in this order:

    1. the first hit having a gene whose name equals ``protein_name``
       exactly (the comparison is case-sensitive);
    2. otherwise the first hit whose ``entryType`` marks it as reviewed
       (Swiss-Prot);
    3. otherwise the first hit.

    Args:
        protein_name: Free-text query sent to the UniProt search endpoint.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Whatever ``process_result`` returns for the chosen hit (an
        ``(accession, name)`` tuple), or ``(None, None)`` when the search
        yields no hits or the request fails. Failures are printed, not
        raised.
    """
    url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": protein_name,
        "format": "json",
        "fields": "accession,id,protein_name,gene_names,organism_name,reviewed",
        "size": 10,  # Ask for several hits so an exact gene match can win
    }
    try:
        # Without a timeout a stalled connection would block indefinitely;
        # requests' Timeout is a RequestException, so it is handled below.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"Error occurred while fetching data: {e}")
        return None, None

    results = data.get('results')
    if not results:
        print(f"No results found for '{protein_name}'")
        return None, None

    # Try to find an exact match for the gene name first.
    for result in results:
        genes = result.get('genes', [])
        if any(gene.get('geneName', {}).get('value') == protein_name for gene in genes):
            print("Exact gene match found:", result)
            return process_result(result)

    # No exact match: prefer the first reviewed (Swiss-Prot) entry.
    reviewed_result = next(
        (r for r in results if r.get('entryType') == 'UniProtKB reviewed (Swiss-Prot)'),
        None,
    )
    if reviewed_result:
        print("Reviewed entry found:", reviewed_result)
        return process_result(reviewed_result)

    print("Using first result:", results[0])
    return process_result(results[0])
    

def _name_from_block(block):
    """Return a name block's full name, else its first non-empty short name."""
    if not isinstance(block, dict):
        return None
    full_name = (block.get('fullName') or {}).get('value')
    if full_name:
        return full_name
    # Accept both the plural and the singular spelling of the short-name list.
    short_names = block.get('shortNames') or block.get('shortName') or []
    for short in short_names:
        if isinstance(short, dict) and short.get('value'):
            return short['value']
    return None


def process_result(result):
    """Extract ``(primary_accession, name)`` from one UniProt search hit.

    The display name is taken from the first of these that yields a value:

    1. ``proteinName[0]`` (legacy layout this function originally read);
    2. ``proteinDescription.recommendedName`` (the layout
       ``fetch_uniprot_info`` in this file reads from UniProt's JSON);
    3. ``proteinDescription.submissionNames[0]``
       (NOTE(review): presumably used by unreviewed entries - confirm);
    4. the ``id`` field, then ``uniProtkbId``.

    Within a name block the full name is preferred over a short name.
    Missing or empty lists are tolerated instead of raising ``IndexError``.

    Args:
        result: One element of the search response's ``results`` list.

    Returns:
        A ``(primary_accession, name)`` tuple; either element may be
        ``None`` when the corresponding data is absent.
    """
    primary_accession = result.get('primaryAccession')

    description = result.get('proteinDescription') or {}
    name_blocks = [
        *(result.get('proteinName') or [])[:1],
        description.get('recommendedName'),
        *(description.get('submissionNames') or [])[:1],
    ]
    for block in name_blocks:
        name = _name_from_block(block)
        if name:
            return primary_accession, name

    # Last resort: fall back to an entry identifier.
    entry_id = result.get('id')
    if not entry_id:
        entry_id = result.get('uniProtkbId', entry_id)
    return primary_accession, entry_id

def fetch_uniprot_info(accession, email, timeout=30):
    """Fetch one UniProtKB entry and flatten it into a summary dict.

    Args:
        accession: UniProt accession appended to the entry URL.
        email: Contact address embedded in the ``User-Agent`` header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``accession``, ``entry_type``, ``entry_name``,
        ``protein_name``, ``gene_name``, ``organism``, ``sequence``,
        ``sequence_length``, ``function``, ``subcellular_locations``,
        ``ec_numbers``, ``keywords`` and ``features``; scalar fields are
        ``None`` when absent from the response. Returns ``None`` when the
        request fails (the error is printed, not raised).
    """
    uniprot_base_url = "https://rest.uniprot.org/uniprotkb/"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(
            f"{uniprot_base_url}{accession}", headers=headers, timeout=timeout
        )
        response.raise_for_status()
        uniprot_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"Error fetching UniProt data: {e}")
        return None

    description = uniprot_data.get('proteinDescription', {})
    recommended = description.get('recommendedName', {})
    sequence = uniprot_data.get('sequence', {})
    comments = uniprot_data.get('comments', [])

    # First FUNCTION comment's first text; `or [{}]` tolerates an empty list.
    function_text = next(
        (
            (comment.get('texts') or [{}])[0].get('value')
            for comment in comments
            if comment.get('commentType') == 'FUNCTION'
        ),
        None,
    )

    # Keep the original top-level lookup, then add EC numbers found under
    # recommendedName.
    # NOTE(review): UniProt's entry JSON appears to nest ecNumbers under
    # recommendedName rather than directly under proteinDescription - confirm.
    ec_numbers = [ec.get('value') for ec in description.get('ecNumbers', [])]
    for ec in recommended.get('ecNumbers', []):
        value = ec.get('value')
        if value not in ec_numbers:
            ec_numbers.append(value)

    return {
        "accession": accession,
        "entry_type": uniprot_data.get('entryType'),
        "entry_name": uniprot_data.get('uniProtkbId'),
        "protein_name": recommended.get('fullName', {}).get('value'),
        # First gene that actually carries a geneName.
        "gene_name": next(
            (
                gene.get('geneName', {}).get('value')
                for gene in uniprot_data.get('genes', [])
                if gene.get('geneName')
            ),
            None,
        ),
        "organism": uniprot_data.get('organism', {}).get('scientificName'),
        "sequence": sequence.get('value'),
        "sequence_length": sequence.get('length'),
        "function": function_text,
        "subcellular_locations": [
            loc.get('location', {}).get('value')
            for comment in comments
            if comment.get('commentType') == 'SUBCELLULAR LOCATION'
            for loc in comment.get('subcellularLocations', [])
        ],
        "ec_numbers": ec_numbers,
        "keywords": [kw.get('name') for kw in uniprot_data.get('keywords', [])],
        "features": [
            {'type': f.get('type'), 'description': f.get('description')}
            for f in uniprot_data.get('features', [])
        ],
    }

def fetch_comprehensive_interpro_info(accession, email, timeout=30):
    """Fetch the InterPro record for a UniProt accession as parsed JSON.

    Args:
        accession: UniProt accession inserted into the InterPro URL.
        email: Contact address embedded in the ``User-Agent`` header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response, unmodified, or ``None`` when the request
        fails or the body is not valid JSON (the error is printed, not
        raised).
    """
    base_url = "https://www.ebi.ac.uk/interpro/api/protein/uniprot/"
    protein_url = f"{base_url}{accession}/entry_protein_locations/"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(protein_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an empty or non-JSON body on requests versions
        # whose JSON decode error is not a RequestException subclass.
        print(f"Error fetching InterPro data: {e}")
        return None

def _summarize_pdb_entry(pdb_id, pdb_data):
    """Condense one RCSB core-entry payload into the per-structure summary."""
    # `or` fallbacks tolerate keys that are present but empty or null, which
    # would otherwise break the [0] indexing.
    exptl = pdb_data.get('exptl') or [{}]
    resolutions = pdb_data.get('rcsb_entry_info', {}).get('resolution_combined') or [None]

    # NOTE(review): the core "entry" payload may not carry
    # 'nonpolymer_entities'; if so this list is always empty - confirm
    # against the RCSB Data API schema.
    ligands = []
    for ligand in pdb_data.get('nonpolymer_entities', []):
        chem_comp = ligand.get('chem_comp', {})
        ligands.append({
            "chem_comp_id": chem_comp.get('id'),
            "name": chem_comp.get('name'),
            "formula": chem_comp.get('formula'),
            "weight": chem_comp.get('formula_weight')
        })

    return {
        "pdb_id": pdb_id,
        "title": pdb_data.get('struct', {}).get('title'),
        "deposition_date": pdb_data.get('rcsb_accession_info', {}).get('deposit_date'),
        "release_date": pdb_data.get('rcsb_accession_info', {}).get('initial_release_date'),
        "experimental_method": exptl[0].get('method'),
        "resolution": resolutions[0],
        "authors": [author.get("name") for author in pdb_data.get("audit_author", [])],
        "ligands": ligands,
        "pdb_structure_link": f"https://www.rcsb.org/3d-view/{pdb_id}"
    }


def fetch_pdb_info(accession, email, timeout=30):
    """Find PDB structures for a UniProt accession and summarise each one.

    Runs an exact-match search on the reference-sequence accession
    attribute, then fetches every matching entry individually (pausing
    0.1 s before each request).

    Args:
        accession: UniProt accession to search for.
        email: Contact address embedded in the ``User-Agent`` header.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        ``{"pdb_entries": [...]}`` with one summary dict per structure, or a
        dict with ``message`` and ``alphafold_link`` keys when no structure
        is found or any request fails (failures are printed, not raised).
    """
    pdb_search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }

    # Construct the search query
    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                "operator": "exact_match",
                "value": accession
            }
        },
        "return_type": "entry",
        "request_options": {
            "return_all_hits": True
        }
    }

    try:
        # Perform the search. Without a timeout a stalled connection would
        # block indefinitely.
        response = requests.post(
            pdb_search_url, headers=headers, data=json.dumps(query), timeout=timeout
        )
        response.raise_for_status()
        # An empty body (the search service answers "no hits" with
        # 204 No Content) is not parseable JSON; treat it as zero results
        # rather than as an error.
        search_results = response.json() if response.content else {}
        pdb_ids = [result['identifier'] for result in search_results.get('result_set') or []]

        if not pdb_ids:
            # No PDB entries found
            return {
                "message": "Protein not found in PDB.",
                "alphafold_link": f"https://alphafold.ebi.ac.uk/entry/{accession}"
            }

        pdb_info_list = []
        for pdb_id in pdb_ids:
            time.sleep(0.1)  # Be polite to the API
            pdb_entry_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
            response = requests.get(pdb_entry_url, headers=headers, timeout=timeout)
            response.raise_for_status()
            pdb_info_list.append(_summarize_pdb_entry(pdb_id, response.json()))
        return {"pdb_entries": pdb_info_list}
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"Error fetching PDB data: {e}")
        return {
            "message": "Error fetching PDB data.",
            "alphafold_link": f"https://alphafold.ebi.ac.uk/entry/{accession}"
        }

def fetch_protein_go_terms(uniprot_id, email, timeout=30):
    """Fetch up to 10 GO annotations for a gene product from QuickGO.

    Args:
        uniprot_id: Identifier sent as the ``geneProductId`` query parameter.
        email: Contact address embedded in the ``User-Agent`` header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``id``, ``term``, ``aspect``,
        ``evidence`` and ``reference``, one per annotation in the response.
        Returns an empty list when the request fails (the error is printed,
        not raised) or when there are no annotations.
    """
    base_url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    params = {
        "geneProductId": uniprot_id,
        "limit": 10  # Only the first 10 annotations returned are kept
    }
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"Error fetching GO terms for {uniprot_id}: {str(e)}")
        return []

    # `or []` tolerates a response whose "results" is present but null.
    return [
        {
            "id": annotation.get('goId'),
            "term": annotation.get('goName'),
            "aspect": annotation.get('goAspect'),
            "evidence": annotation.get('goEvidence'),
            "reference": annotation.get('reference')
        }
        for annotation in data.get('results') or []
    ]
    

def main():
    """Interactively look up one protein and dump everything found to JSON.

    Prompts for a protein name, resolves it to an accession with
    ``fetch_protein_info``, then calls the UniProt, InterPro, PDB and GO
    fetchers in that order and writes their combined results to
    ``<accession>_comprehensive_info.json`` in the working directory.
    Returns early, after printing a message, when no accession is found.
    """
    contact_email = "your_email@example.com"  # Replace with your actual email
    protein_name = input("What protein would you like to know about? ")

    print(f"\nFetching information for: {protein_name}")
    accession, full_name = fetch_protein_info(protein_name)

    if not accession:
        print(f"No results found for '{protein_name}'")
        return

    print(f"Protein: {full_name}")
    print(f"Accession: {accession}")

    # (output key, fetcher) pairs; fetchers run in exactly this order and
    # each receives the accession and the contact e-mail.
    sources = (
        ("uniprot", fetch_uniprot_info),
        ("interpro", fetch_comprehensive_interpro_info),
        ("pdb", fetch_pdb_info),
        ("go_terms", fetch_protein_go_terms),
    )
    report = {key: fetch(accession, contact_email) for key, fetch in sources}

    # Save the data to a JSON file
    filename = f"{accession}_comprehensive_info.json"
    with open(filename, 'w') as out_file:
        json.dump(report, out_file, indent=2)

    print(f"\nComprehensive information has been saved to {filename}")

# Run the interactive lookup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()