nakulk02 committed on
Commit
ef85c96
·
verified ·
1 Parent(s): cda13f7

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +170 -0
  2. fetch_all_proteins.py +238 -0
  3. requirements.txt +76 -0
app.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import gradio as gr
3
+ import os
4
+ from dotenv import load_dotenv
5
+ from fetch_all_proteins import *
6
+ import json
7
+ import requests
8
+
9
+
10
+ load_dotenv()
11
+
12
+ email = os.getenv("EMAIL")
13
+
14
+
15
+
16
+ client = OpenAI(
17
+ api_key=os.environ.get("OPENAI_API_KEY"),
18
+ )
19
+
20
def chatApiCall(messages):
    """Send a chat-completion request and return the assistant's reply text.

    Parameters
    ----------
    messages : list[dict]
        OpenAI-style message dicts ({"role": ..., "content": ...}).

    Returns
    -------
    str
        The content of the first completion choice.
    """
    # NOTE(review): a RapidAPI "claude3" fallback used to live here, but its
    # payload/url/headers were built and never sent (dead code) and it reused
    # OPENAI_API_KEY as the rapidapi key — removed.
    response = client.chat.completions.create(
        messages=messages,
        model="gpt-4o-mini",
    )
    res_json = response.choices[0].message.content
    print("response", res_json)
    return res_json
39
+
40
+ ## Create a function that determines whether the request relates to a protein or not
41
def IsProteinRequest(history, message):
    """Ask the model whether `message` (in context of `history`) refers to a
    specific protein.

    Returns the raw model reply, which the prompt constrains to the literal
    strings "true" or "false".

    BUG FIX: the original appended the classification prompt onto the caller's
    `history` list in place, so the leftover prompt leaked into subsequent
    calls (e.g. ProteinName reused the same polluted list). We now build a new
    message list and leave the caller's history untouched.
    """
    prompt = """Respond only with true when the conditions below are met otherwise respond only with false. The conditions
are as follows:
1: Within the context of the chat history, the message refers to a specific protein.
2: A specific protein name is mentioned in the message. GCPR proteins alone does not count.
3: If there are no proteins mentioned in the chat history, there should be a specific protein name mentioned in the message.
4: If there are no chat histories, look at the following message.
5: If you detect any generalized requests like "Tell me about proteins" "Tell me about receptors" or any request that has no
specific protein mentioned like Rhodopsin or OR51E2, respond with false.
The message is as follows: """ + message
    conversation = history + [{"role": "user", "content": f"{prompt}"}]
    response = chatApiCall(conversation)
    return response
64
+
65
+ ## Create a function that returns the name of the protein
66
def ProteinName(history, message):
    """Ask the model which protein `message` refers to, given `history`.

    Returns the raw model reply (expected to be just the protein name).

    BUG FIX: the original mutated the caller's `history` list by appending the
    extraction prompt in place; we now build a fresh list so the shared
    conversation state is not polluted between helper calls.
    """
    prompt = """Respond only with the name of the protein the message is referring to with respect to both
the chat history above and the message itself. The message is as follows: """ + message

    conversation = history + [{"role": "user", "content": f"{prompt}"}]
    response = chatApiCall(conversation)
    return response
83
+
84
+
85
+ ## Create a function that takes in a protein name and returns protein info
86
def ProteinInfo(protein):
    """Aggregate UniProt, InterPro and QuickGO data for `protein`.

    Returns a JSON string with keys "uniprot", "interpro" and "quickgo",
    or a JSON error object when no UniProt entry matches.
    """
    accession, full_name = fetch_protein_info(protein)
    # ROBUSTNESS FIX: fetch_protein_info returns (None, None) on no match or
    # request failure; previously the None accession was passed straight into
    # the downstream fetchers.
    if accession is None:
        return json.dumps({"error": f"No UniProt entry found for '{protein}'"})
    all_data = {
        "uniprot": fetch_uniprot_info(accession, email),
        "interpro": fetch_comprehensive_interpro_info(accession, email),
        "quickgo": {}
    }

    go_terms = fetch_protein_go_terms(accession, email)
    all_data["quickgo"]["go_terms"] = go_terms
    return json.dumps(all_data)
104
+
105
+ ## Create a function that takes in a message and a protein information and returns an informed response
106
def InformedResponse(proteinInfo, message):
    """Answer `message` using the serialized `proteinInfo` as grounding context.

    Builds a two-message conversation (system persona + user question prefixed
    with the protein data) and returns the model's reply.
    """
    prompt = f"{proteinInfo} From the following information given, answer this question: " + message
    conversation = [
        {"role": "system", "content": "You are a helpful assistant with extensive background in protein analysis."},
        {"role": "user", "content": f"{prompt}"},
    ]
    return chatApiCall(conversation)
123
+
124
def HistoryConverter(history):
    """Convert gradio-style [user, assistant] pairs to OpenAI message dicts.

    The result always starts with the protein-analysis system prompt, followed
    by alternating "user"/"assistant" messages for each pair in `history`.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant with extensive background in protein analysis."}
    turns = [
        {"role": role, "content": f"{pair[idx]}"}
        for pair in history
        for idx, role in ((0, "user"), (1, "assistant"))
    ]
    return [system_message] + turns
133
+
134
+
135
+
136
def openai_chatbot(message, history):
    """Gradio chat handler.

    Routes protein-specific questions through the UniProt/InterPro/QuickGO
    retrieval pipeline; everything else goes straight to the chat model.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list
        Gradio chat history as [user, assistant] pairs.
    """
    formatted_history = HistoryConverter(history=history)

    isProteinRequest = IsProteinRequest(history=formatted_history, message=message)

    # ROBUSTNESS FIX: the classifier is prompted to reply exactly "true", but
    # model output can vary in casing/whitespace; normalize before comparing
    # instead of a brittle == "true" check.
    if isProteinRequest.strip().lower() == "true":
        proteinName = ProteinName(history=formatted_history, message=message)
        proteinInfo = ProteinInfo(protein=proteinName)
        print(proteinName, proteinInfo)
        return InformedResponse(proteinInfo=proteinInfo, message=message)
    else:
        # Rebuild from the raw history: IsProteinRequest may have appended its
        # classification prompt to formatted_history.
        messages = HistoryConverter(history=history)
        messages.append({"role": "user", "content": f"{message}"})
        response = chatApiCall(messages)
        return response
167
+
168
if __name__ == "__main__":
    # Launch the Gradio chat UI when run as a script.
    chat_ui = gr.ChatInterface(openai_chatbot, title="OpenAI Chatbot", description="Start Chatting")
    chat_ui.launch()
fetch_all_proteins.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import time
4
+ import urllib.parse
5
+
6
+
7
def fetch_protein_info(protein_name, timeout=30):
    """Search UniProtKB for `protein_name` and return (accession, full_name).

    Match preference: exact gene-name match, then a reviewed (Swiss-Prot)
    entry, then the first hit. Returns (None, None) when nothing matches or
    the request fails.

    Parameters
    ----------
    protein_name : str
        Free-text protein or gene name.
    timeout : float, optional
        Per-request timeout in seconds (new, defaulted — backward compatible).
        Prevents the caller from hanging forever on a stalled connection.
    """
    url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": protein_name,
        "format": "json",
        "fields": "accession,id,protein_name,gene_names,organism_name,reviewed",
        "size": 10  # Increase size to get more results
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        if data.get('results'):
            # Try to find an exact match for the gene name first
            for result in data['results']:
                gene_names = result.get('genes', [])
                if gene_names and any(gene.get('geneName', {}).get('value') == protein_name for gene in gene_names):
                    print("Exact gene match found:", result)
                    return process_result(result)

            # If no exact match, return the first reviewed (Swiss-Prot) entry or the first result
            reviewed_result = next((r for r in data['results'] if r.get('entryType') == 'UniProtKB reviewed (Swiss-Prot)'), None)
            if reviewed_result:
                print("Reviewed entry found:", reviewed_result)
                return process_result(reviewed_result)
            else:
                print("Using first result:", data['results'][0])
                return process_result(data['results'][0])
        else:
            print(f"No results found for '{protein_name}'")
            return None, None
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching data: {e}")
        return None, None
41
+
42
+
43
def process_result(result):
    """Extract (primary accession, best display name) from a UniProt hit.

    Name preference order: full name, then short name, then the entry id.
    """
    accession = result.get('primaryAccession')
    name_entry = result.get('proteinName', [{}])[0]
    display_name = name_entry.get('fullName', {}).get('value')
    if not display_name:
        display_name = name_entry.get('shortName', [{}])[0].get('value')
    if not display_name:
        display_name = result.get('id')
    return accession, display_name
51
+
52
def fetch_uniprot_info(accession, email, timeout=30):
    """Fetch a UniProtKB entry and flatten it into a summary dict.

    Parameters
    ----------
    accession : str
        UniProt primary accession (e.g. "P08100").
    email : str
        Contact address embedded in the User-Agent, per API etiquette.
    timeout : float, optional
        Per-request timeout in seconds (new, defaulted — backward compatible).

    Returns
    -------
    dict | None
        Summary with sequence, function, locations, keywords and features,
        or None on request failure.
    """
    uniprot_base_url = "https://rest.uniprot.org/uniprotkb/"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    try:
        response = requests.get(f"{uniprot_base_url}{accession}", headers=headers, timeout=timeout)
        response.raise_for_status()
        uniprot_data = response.json()
        protein_info = {
            "accession": accession,
            "entry_type": uniprot_data.get('entryType'),
            "entry_name": uniprot_data.get('uniProtkbId'),
            "protein_name": uniprot_data.get('proteinDescription', {}).get('recommendedName', {}).get('fullName', {}).get('value'),
            "gene_name": next((gene.get('geneName', {}).get('value') for gene in uniprot_data.get('genes', []) if gene.get('geneName')), None),
            "organism": uniprot_data.get('organism', {}).get('scientificName'),
            "sequence": uniprot_data.get('sequence', {}).get('value'),
            "sequence_length": uniprot_data.get('sequence', {}).get('length'),
            # First FUNCTION comment only; entries may carry several.
            "function": next((comment.get('texts', [{}])[0].get('value') for comment in uniprot_data.get('comments', []) if comment.get('commentType') == 'FUNCTION'), None),
            "subcellular_locations": [
                loc.get('location', {}).get('value')
                for comment in uniprot_data.get('comments', [])
                if comment.get('commentType') == 'SUBCELLULAR LOCATION'
                for loc in comment.get('subcellularLocations', [])
            ],
            "ec_numbers": [ec.get('value') for ec in uniprot_data.get('proteinDescription', {}).get('ecNumbers', [])],
            "keywords": [kw.get('name') for kw in uniprot_data.get('keywords', [])],
            "features": [{'type': f.get('type'), 'description': f.get('description')} for f in uniprot_data.get('features', [])]
        }
        return protein_info
    except requests.exceptions.RequestException as e:
        print(f"Error fetching UniProt data: {e}")
        return None
86
+
87
def fetch_comprehensive_interpro_info(accession, email, timeout=30):
    """Fetch InterPro entry/protein-location data for a UniProt accession.

    Parameters
    ----------
    accession : str
        UniProt primary accession.
    email : str
        Contact address embedded in the User-Agent.
    timeout : float, optional
        Per-request timeout in seconds (new, defaulted — backward compatible).

    Returns
    -------
    dict | None
        Raw InterPro JSON payload, or None on request failure.
    """
    base_url = "https://www.ebi.ac.uk/interpro/api/protein/uniprot/"
    protein_url = f"{base_url}{accession}/entry_protein_locations/"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    try:
        response = requests.get(protein_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        interpro_data = response.json()
        return interpro_data
    except requests.exceptions.RequestException as e:
        print(f"Error fetching InterPro data: {e}")
        return None
102
+
103
def fetch_pdb_info(accession, email, timeout=30):
    """Find PDB structures cross-referenced to a UniProt accession.

    Searches RCSB for entries whose reference-sequence accession matches,
    then fetches entry metadata (title, dates, method, resolution, authors,
    ligands) for each hit.

    Parameters
    ----------
    accession : str
        UniProt primary accession.
    email : str
        Contact address embedded in the User-Agent.
    timeout : float, optional
        Per-request timeout in seconds (new, defaulted — backward compatible).

    Returns
    -------
    dict
        {"pdb_entries": [...]} on success; otherwise a message plus an
        AlphaFold link as a fallback.
    """
    pdb_search_url = "https://search.rcsb.org/rcsbsearch/v2/query"
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }

    # Construct the search query
    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                "operator": "exact_match",
                "value": accession
            }
        },
        "return_type": "entry",
        "request_options": {
            "return_all_hits": True
        }
    }

    try:
        # Perform the search. Using json= lets requests serialize the body
        # (instead of data=json.dumps(...)) and keeps Content-Type consistent.
        response = requests.post(pdb_search_url, headers=headers, json=query, timeout=timeout)
        response.raise_for_status()
        search_results = response.json()
        pdb_ids = [result['identifier'] for result in search_results.get('result_set', [])]

        if not pdb_ids:
            # No PDB entries found
            return {
                "message": "Protein not found in PDB.",
                "alphafold_link": f"https://alphafold.ebi.ac.uk/entry/{accession}"
            }

        pdb_info_list = []
        for pdb_id in pdb_ids:
            time.sleep(0.1)  # Be polite to the API
            pdb_entry_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
            response = requests.get(pdb_entry_url, headers=headers, timeout=timeout)
            response.raise_for_status()
            pdb_data = response.json()
            pdb_info = {
                "pdb_id": pdb_id,
                "title": pdb_data.get('struct', {}).get('title'),
                "deposition_date": pdb_data.get('rcsb_accession_info', {}).get('deposit_date'),
                "release_date": pdb_data.get('rcsb_accession_info', {}).get('initial_release_date'),
                "experimental_method": pdb_data.get('exptl', [{}])[0].get('method'),
                "resolution": pdb_data.get('rcsb_entry_info', {}).get('resolution_combined', [None])[0],
                "authors": [author.get("name") for author in pdb_data.get("audit_author", [])],
                "ligands": [],
                "pdb_structure_link": f"https://www.rcsb.org/3d-view/{pdb_id}"
            }
            # Fetch ligand information
            ligand_entities = pdb_data.get('nonpolymer_entities', [])
            for ligand in ligand_entities:
                chem_comp = ligand.get('chem_comp', {})
                ligand_info = {
                    "chem_comp_id": chem_comp.get('id'),
                    "name": chem_comp.get('name'),
                    "formula": chem_comp.get('formula'),
                    "weight": chem_comp.get('formula_weight')
                }
                pdb_info['ligands'].append(ligand_info)
            pdb_info_list.append(pdb_info)
        return {"pdb_entries": pdb_info_list}
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDB data: {e}")
        return {
            "message": "Error fetching PDB data.",
            "alphafold_link": f"https://alphafold.ebi.ac.uk/entry/{accession}"
        }
179
+
180
def fetch_protein_go_terms(uniprot_id, email, timeout=30):
    """Fetch up to 10 GO annotations for a UniProt id from QuickGO.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession used as the geneProductId filter.
    email : str
        Contact address embedded in the User-Agent.
    timeout : float, optional
        Per-request timeout in seconds (new, defaulted — backward compatible).

    Returns
    -------
    list[dict]
        GO term summaries (id, term, aspect, evidence, reference);
        empty list on request failure.
    """
    base_url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
    headers = {
        "Accept": "application/json",
        "User-Agent": f"Python script (mailto:{email})"
    }
    params = {
        "geneProductId": uniprot_id,
        "limit": 10  # Limit to top 10 GO terms
    }
    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        go_terms = []
        for annotation in data.get('results', []):
            go_terms.append({
                "id": annotation.get('goId'),
                "term": annotation.get('goName'),
                "aspect": annotation.get('goAspect'),
                "evidence": annotation.get('goEvidence'),
                "reference": annotation.get('reference')
            })
        return go_terms
    except requests.exceptions.RequestException as e:
        print(f"Error fetching GO terms for {uniprot_id}: {str(e)}")
        return []
207
+
208
+
209
def main():
    """CLI entry point: prompt for a protein name and dump aggregated
    UniProt/InterPro/PDB/GO data to a JSON file."""
    email = "your_email@example.com"  # Replace with your actual email
    protein_name = input("What protein would you like to know about? ")

    print(f"\nFetching information for: {protein_name}")
    accession, full_name = fetch_protein_info(protein_name)

    if not accession:
        print(f"No results found for '{protein_name}'")
        return

    print(f"Protein: {full_name}")
    print(f"Accession: {accession}")

    all_data = {
        "uniprot": fetch_uniprot_info(accession, email),
        "interpro": fetch_comprehensive_interpro_info(accession, email),
        "pdb": fetch_pdb_info(accession, email),
        "go_terms": fetch_protein_go_terms(accession, email)
    }

    # Save the data to a JSON file
    filename = f"{accession}_comprehensive_info.json"
    with open(filename, 'w') as f:
        json.dump(all_data, f, indent=2)

    # BUG FIX: previously printed a literal placeholder instead of the
    # actual output path.
    print(f"\nComprehensive information has been saved to {filename}")

if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface_hub==0.25.2
2
+ aiofiles==23.2.1
3
+ annotated-types==0.7.0
4
+ anyio==4.4.0
5
+ cachetools==5.4.0
6
+ certifi==2024.7.4
7
+ charset-normalizer==3.3.2
8
+ click==8.1.7
9
+ contourpy==1.2.1
10
+ cycler==0.12.1
11
+ fastapi==0.112.0
12
+ ffmpy==0.4.0
13
+ filelock==3.15.4
14
+ fonttools==4.53.1
15
+ fsspec==2024.6.1
16
+ google-ai-generativelanguage==0.6.6
17
+ google-api-core==2.19.1
18
+ google-api-python-client==2.140.0
19
+ google-auth==2.33.0
20
+ google-auth-httplib2==0.2.0
21
+ google-generativeai==0.7.2
22
+ googleapis-common-protos==1.63.2
23
+ gradio==4.41.0
24
+ gradio_client==1.3.0
25
+ grpcio==1.65.4
26
+ grpcio-status==1.62.3
27
+ h11==0.14.0
28
+ httpcore==1.0.5
29
+ httplib2==0.22.0
30
+ httpx==0.27.0
31
+ idna==3.7
32
+ importlib_resources==6.4.0
33
+ Jinja2==3.1.4
34
+ kiwisolver==1.4.5
35
+ markdown-it-py==3.0.0
36
+ MarkupSafe==2.1.5
37
+ matplotlib==3.9.2
38
+ mdurl==0.1.2
39
+ numpy==2.0.1
40
+ orjson==3.10.7
41
+ packaging==24.1
42
+ pandas==2.2.2
43
+ pillow==10.4.0
44
+ proto-plus==1.24.0
45
+ protobuf==4.25.4
46
+ pyasn1==0.6.0
47
+ pyasn1_modules==0.4.0
48
+ pydantic==2.8.2
49
+ pydantic_core==2.20.1
50
+ pydub==0.25.1
51
+ Pygments==2.18.0
52
+ pyparsing==3.1.2
53
+ python-dateutil==2.9.0.post0
54
+ python-dotenv==1.0.1
55
+ python-multipart==0.0.9
56
+ pytz==2024.1
57
+ PyYAML==6.0.2
58
+ requests==2.32.3
59
+ rich==13.7.1
60
+ rsa==4.9
61
+ ruff==0.5.7
62
+ semantic-version==2.10.0
63
+ shellingham==1.5.4
64
+ six==1.16.0
65
+ sniffio==1.3.1
66
+ starlette==0.37.2
67
+ tomlkit==0.12.0
68
+ tqdm==4.66.5
69
+ typer==0.12.3
70
+ typing_extensions==4.12.2
71
+ tzdata==2024.1
72
+ uritemplate==4.1.1
73
+ urllib3==2.2.2
74
+ uvicorn==0.30.6
75
+ websockets==12.0
76
+ openai==1.52.0