Mohamed284 committed on
Commit
ab710db
·
1 Parent(s): d90d3ca

Update API keys and enhance chatbot functionality with Groq integration

Browse files
Files changed (3) hide show
  1. .env +1 -2
  2. app.py +74 -21
  3. main.ipynb +412 -65
.env CHANGED
@@ -1,8 +1,7 @@
1
  # API Configuration
2
  OPENAI_API_KEY="d1c9ed1ca70b9721dee1087d93f9662a"
3
  GEMINI_API_KEY="AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
4
- # GCP_PROJECT_ID="1008673779731"
5
- # GCP_API_KEY="AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
6
 
7
  GEMINI_API_KEY_1= "AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
8
  GEMINI_API_KEY_2= "AIzaSyDzQSzM9vA6Le36V65I2meN5URclq4JSx0"
 
1
  # API Configuration
2
  OPENAI_API_KEY="d1c9ed1ca70b9721dee1087d93f9662a"
3
  GEMINI_API_KEY="AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
4
+ GROQ_API_KEY="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"
 
5
 
6
  GEMINI_API_KEY_1= "AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
7
  GEMINI_API_KEY_2= "AIzaSyDzQSzM9vA6Le36V65I2meN5URclq4JSx0"
app.py CHANGED
@@ -1,13 +1,16 @@
1
- # Combined Llama 3 and Gemini Flash Chatbot
 
 
 
 
2
  import json
3
  import logging
4
  import re
5
- import os
6
- import pickle
7
- from typing import List, Tuple, Optional
8
  import gradio as gr
9
  from openai import OpenAI
10
  import google.generativeai as genai
 
11
  from functools import lru_cache
12
  from tenacity import retry, stop_after_attempt, wait_exponential
13
  from langchain_community.retrievers import BM25Retriever
@@ -204,30 +207,37 @@ class EnhancedRetriever:
204
  context = []
205
  for doc in docs:
206
  context_str = f"""**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})
207
- **Application**: {doc.metadata['application']}
208
- **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}
209
- **Strategy Excerpt**:\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}"""
 
210
  context.append(context_str)
211
  return "\n\n---\n\n".join(context)
212
 
213
  # --- Generation System ---
214
- SYSTEM_PROMPT = """**Biomimicry Expert Guidelines**
215
- 1. Firstly Base answers strictly on context and if there is not context answer by your own.
216
- 2. Cite sources as [Source] witht the hyperlink
217
- 3. **Bold** technical terms
218
- 4. Include reference links at the end of the response
 
 
 
219
 
220
- Context: {context}"""
 
221
 
222
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))
223
  def get_ai_response(query: str, context: str, model: str) -> str:
 
224
  try:
225
  if model == "gemini-2.0-flash":
226
  gemini_model = genai.GenerativeModel(model)
227
  response = gemini_model.generate_content(
228
  f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
229
  )
230
- return _postprocess_response(response.text)
 
231
  elif model == "meta-llama-3-70b-instruct":
232
  response = client.chat.completions.create(
233
  model=model,
@@ -238,7 +248,20 @@ def get_ai_response(query: str, context: str, model: str) -> str:
238
  temperature=0.4,
239
  max_tokens=2000
240
  )
241
- return _postprocess_response(response.choices[0].message.content)
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  except Exception as e:
243
  logger.error(f"Generation Error: {str(e)}")
244
  return "I'm unable to generate a response right now. Please try again later."
@@ -248,6 +271,36 @@ def _postprocess_response(response: str) -> str:
248
  response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
249
  return response
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  # --- Pipeline ---
252
  documents = load_and_chunk_data(data_file_name)
253
  retriever = EnhancedRetriever(documents)
@@ -266,22 +319,22 @@ def chat_interface(question: str, history: List[Tuple[str, str]], model: str):
266
  return "", history + [(question, response)]
267
 
268
  with gr.Blocks(title="AskNature BioRAG Expert", theme=gr.themes.Soft()) as demo:
269
- gr.Markdown("# 🌿 AskNature RAG-based Chatbot ")
270
  with gr.Row():
271
  chatbot = gr.Chatbot(label="Dialogue History", height=500)
272
  with gr.Row():
273
- question = gr.Textbox(placeholder="Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')",
274
- label="Inquiry", scale=4)
275
- model_selector = gr.Dropdown(choices=["gemini-2.0-flash", "meta-llama-3-70b-instruct"], label="Generation Model", value="gemini-2.0-flash")
276
  clear_btn = gr.Button("Clear History", variant="secondary")
277
-
278
  gr.Markdown("""
279
  <div style="text-align: center; color: #4a7c59;">
280
  <small>Powered by AskNature's Database |
281
  Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small>
282
  </div>""")
 
283
  question.submit(chat_interface, [question, chatbot, model_selector], [question, chatbot])
284
  clear_btn.click(lambda: [], None, chatbot)
285
 
286
  if __name__ == "__main__":
287
- demo.launch(show_error=True)
 
1
+ # Combined chatbot: Gemini Flash, Meta Llama 3 (GWDG), and Llama 3 (Groq)
2
+ # Gemini Flash rate limit: 15 requests per minute
3
+ # Groq rate limits: 30 requests/minute (RPM), 14,400 requests/day (RPD), 6K tokens/minute (TPM), 500K tokens/day (TPD)
4
+
5
+ import os
6
  import json
7
  import logging
8
  import re
9
+ from typing import List, Tuple
 
 
10
  import gradio as gr
11
  from openai import OpenAI
12
  import google.generativeai as genai
13
+ import requests
14
  from functools import lru_cache
15
  from tenacity import retry, stop_after_attempt, wait_exponential
16
  from langchain_community.retrievers import BM25Retriever
 
207
  context = []
208
  for doc in docs:
209
  context_str = f"""**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})
210
+ **Application**: {doc.metadata['application']}
211
+ **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}
212
+ **Strategy Excerpt**:
213
+ {doc.page_content.split('Strategy Excerpt:')[-1].strip()}"""
214
  context.append(context_str)
215
  return "\n\n---\n\n".join(context)
216
 
217
  # --- Generation System ---
218
+ SYSTEM_PROMPT = """
219
+ **Biomimicry Expert Guidelines**
220
+
221
+ - Use only the provided AskNature context (e.g., Source, Application, Strategy, technical_concepts). If no context is given, note that you're using your own expertise.
222
+ - When referencing facts, use numeric citations in square brackets (e.g., [1]). Do not include full URLs inline.
223
+ - Bold all technical terms (e.g., **protein-based pigmentation**, **DNA-level fiber design**).
224
+ - Provide a concise, expert answer that explains the innovation and its sustainability benefits.
225
+ - End your response with a "References" section listing each URL with its citation number.
226
 
227
+ Context: {context}
228
+ """
229
 
230
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))
231
  def get_ai_response(query: str, context: str, model: str) -> str:
232
+ result = "" # Initialize the result variable
233
  try:
234
  if model == "gemini-2.0-flash":
235
  gemini_model = genai.GenerativeModel(model)
236
  response = gemini_model.generate_content(
237
  f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
238
  )
239
+ logger.info(f"Response from gemini-2.0-flash: {response.text}")
240
+ result = _postprocess_response(response.text)
241
  elif model == "meta-llama-3-70b-instruct":
242
  response = client.chat.completions.create(
243
  model=model,
 
248
  temperature=0.4,
249
  max_tokens=2000
250
  )
251
+ logger.info(f"Response from meta-llama-3-70b-instruct: {response}")
252
+ try:
253
+ result = response.choices[0].message.content
254
+ except Exception as e:
255
+ logger.error(f"Error processing meta-llama-3-70b-instruct response: {str(e)}")
256
+ result = "Failed to process response from meta-llama-3-70b-instruct"
257
+ elif model == "llama3-70b-8192":
258
+ result = get_groq_llama3_response(query)
259
+ logger.info(f"Response from llama3-70b-8192: {result}")
260
+ if result is None:
261
+ result = "Failed to get response from llama3-70b-8192"
262
+ # Append the model name to the response for clarity
263
+ result += f"\n\n**Model:** {model}"
264
+ return result
265
  except Exception as e:
266
  logger.error(f"Generation Error: {str(e)}")
267
  return "I'm unable to generate a response right now. Please try again later."
 
271
  response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
272
  return response
273
 
274
+ def get_groq_llama3_response(query: str) -> str:
275
+ """Get response from Llama 3 on Groq Cloud."""
276
+ api_key = os.getenv("GROQ_API_KEY")
277
+ url = "https://api.groq.com/openai/v1/chat/completions"
278
+
279
+ headers = {
280
+ "Content-Type": "application/json",
281
+ "Authorization": f"Bearer {api_key}"
282
+ }
283
+
284
+ payload = {
285
+ "model": "llama3-70b-8192",
286
+ "messages": [
287
+ {
288
+ "role": "user",
289
+ "content": query
290
+ }
291
+ ]
292
+ }
293
+
294
+ try:
295
+ response = requests.post(url, headers=headers, json=payload)
296
+ response.raise_for_status()
297
+ result = response.json()
298
+ logger.info(f"Groq API Response: {result}")
299
+ return result["choices"][0]["message"]["content"]
300
+ except requests.exceptions.RequestException as e:
301
+ logger.error(f"Groq API Error: {str(e)}")
302
+ return "An error occurred while contacting Groq's Llama 3 model."
303
+
304
  # --- Pipeline ---
305
  documents = load_and_chunk_data(data_file_name)
306
  retriever = EnhancedRetriever(documents)
 
319
  return "", history + [(question, response)]
320
 
321
  with gr.Blocks(title="AskNature BioRAG Expert", theme=gr.themes.Soft()) as demo:
322
+ gr.Markdown("# 🌿 AskNature RAG-based Chatbot")
323
  with gr.Row():
324
  chatbot = gr.Chatbot(label="Dialogue History", height=500)
325
  with gr.Row():
326
+ question = gr.Textbox(placeholder="Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')", label="Inquiry", scale=4)
327
+ model_selector = gr.Dropdown(choices=["gemini-2.0-flash", "meta-llama-3-70b-instruct(GWDG)", "llama3-70b-8192(Groq)"], label="Generation Model", value="gemini-2.0-flash")
 
328
  clear_btn = gr.Button("Clear History", variant="secondary")
329
+
330
  gr.Markdown("""
331
  <div style="text-align: center; color: #4a7c59;">
332
  <small>Powered by AskNature's Database |
333
  Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small>
334
  </div>""")
335
+
336
  question.submit(chat_interface, [question, chatbot, model_selector], [question, chatbot])
337
  clear_btn.click(lambda: [], None, chatbot)
338
 
339
  if __name__ == "__main__":
340
+ demo.launch(show_error=True)
main.ipynb CHANGED
@@ -873,6 +873,315 @@
873
  "outputs": [],
874
  "source": []
875
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  {
877
  "cell_type": "code",
878
  "execution_count": null,
@@ -885,8 +1194,6 @@
885
  "INFO:__main__:Loading cached documents\n",
886
  "INFO:__main__:Loading cached BM25 index\n",
887
  "INFO:__main__:Loading cached FAISS index\n",
888
- "INFO:faiss.loader:Loading faiss with AVX2 support.\n",
889
- "INFO:faiss.loader:Successfully loaded faiss with AVX2 support.\n",
890
  "c:\\Users\\Mohamed Elsafty\\.conda\\envs\\rag\\Lib\\site-packages\\gradio\\components\\chatbot.py:273: UserWarning: You have not specified a value for the `type` parameter. Defaulting to the 'tuples' format for chatbot messages, but this is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style dictionaries with 'role' and 'content' keys.\n",
891
  " warnings.warn(\n"
892
  ]
@@ -895,16 +1202,15 @@
895
  "name": "stdout",
896
  "output_type": "stream",
897
  "text": [
898
- "* Running on local URL: http://127.0.0.1:7860\n"
899
  ]
900
  },
901
  {
902
  "name": "stderr",
903
  "output_type": "stream",
904
  "text": [
905
- "INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version \"HTTP/1.1 200 OK\"\n",
906
- "INFO:httpx:HTTP Request: GET http://127.0.0.1:7860/gradio_api/startup-events \"HTTP/1.1 200 OK\"\n",
907
- "INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7860/ \"HTTP/1.1 200 OK\"\n"
908
  ]
909
  },
910
  {
@@ -918,7 +1224,7 @@
918
  {
919
  "data": {
920
  "text/html": [
921
- "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
922
  ],
923
  "text/plain": [
924
  "<IPython.core.display.HTML object>"
@@ -931,24 +1237,50 @@
931
  "name": "stderr",
932
  "output_type": "stream",
933
  "text": [
 
934
  "INFO:httpx:HTTP Request: POST https://chat-ai.academiccloud.de/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
935
  "Embedding Progress: 0%| | 0/1 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://chat-ai.academiccloud.de/v1/embeddings \"HTTP/1.1 200 OK\"\n",
936
- "Embedding Progress: 100%|██████████| 1/1 [00:00<00:00, 4.64it/s]\n",
937
- "INFO:httpx:HTTP Request: POST https://chat-ai.academiccloud.de/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
938
  ]
939
  }
940
  ],
941
  "source": [
942
- "# Combined Llama 3 and Gemini Flash Chatbot\n",
 
 
 
 
943
  "import json\n",
944
  "import logging\n",
945
  "import re\n",
946
- "import os\n",
947
- "import pickle\n",
948
- "from typing import List, Tuple, Optional\n",
949
  "import gradio as gr\n",
950
  "from openai import OpenAI\n",
951
  "import google.generativeai as genai\n",
 
952
  "from functools import lru_cache\n",
953
  "from tenacity import retry, stop_after_attempt, wait_exponential\n",
954
  "from langchain_community.retrievers import BM25Retriever\n",
@@ -1145,30 +1477,37 @@
1145
  " context = []\n",
1146
  " for doc in docs:\n",
1147
  " context_str = f\"\"\"**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})\n",
1148
- " **Application**: {doc.metadata['application']}\n",
1149
- " **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}\n",
1150
- " **Strategy Excerpt**:\\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}\"\"\"\n",
 
1151
  " context.append(context_str)\n",
1152
  " return \"\\n\\n---\\n\\n\".join(context)\n",
1153
  "\n",
1154
  "# --- Generation System ---\n",
1155
- "SYSTEM_PROMPT = \"\"\"**Biomimicry Expert Guidelines**\n",
1156
- "1. Base answers strictly on context\n",
1157
- "2. Cite sources as [Source]\n",
1158
- "3. **Bold** technical terms\n",
1159
- "4. Include reference links\n",
1160
  "\n",
1161
- "Context: {context}\"\"\"\n",
 
 
 
 
 
 
 
1162
  "\n",
1163
  "@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))\n",
1164
  "def get_ai_response(query: str, context: str, model: str) -> str:\n",
 
1165
  " try:\n",
1166
  " if model == \"gemini-2.0-flash\":\n",
1167
  " gemini_model = genai.GenerativeModel(model)\n",
1168
  " response = gemini_model.generate_content(\n",
1169
  " f\"{SYSTEM_PROMPT.format(context=context)}\\nQuestion: {query}\\nProvide a detailed technical answer:\"\n",
1170
  " )\n",
1171
- " return _postprocess_response(response.text)\n",
 
1172
  " elif model == \"meta-llama-3-70b-instruct\":\n",
1173
  " response = client.chat.completions.create(\n",
1174
  " model=model,\n",
@@ -1179,7 +1518,20 @@
1179
  " temperature=0.4,\n",
1180
  " max_tokens=2000\n",
1181
  " )\n",
1182
- " return _postprocess_response(response.choices[0].message.content)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
1183
  " except Exception as e:\n",
1184
  " logger.error(f\"Generation Error: {str(e)}\")\n",
1185
  " return \"I'm unable to generate a response right now. Please try again later.\"\n",
@@ -1189,6 +1541,36 @@
1189
  " response = re.sub(r\"\\*\\*([\\w-]+)\\*\\*\", r\"**\\1**\", response)\n",
1190
  " return response\n",
1191
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1192
  "# --- Pipeline ---\n",
1193
  "documents = load_and_chunk_data(data_file_name)\n",
1194
  "retriever = EnhancedRetriever(documents)\n",
@@ -1207,62 +1589,27 @@
1207
  " return \"\", history + [(question, response)]\n",
1208
  "\n",
1209
  "with gr.Blocks(title=\"AskNature BioRAG Expert\", theme=gr.themes.Soft()) as demo:\n",
1210
- " gr.Markdown(\"# 🌿 AskNature RAG-based Chatbot \")\n",
1211
  " with gr.Row():\n",
1212
  " chatbot = gr.Chatbot(label=\"Dialogue History\", height=500)\n",
1213
  " with gr.Row():\n",
1214
- " question = gr.Textbox(placeholder=\"Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')\",\n",
1215
- " label=\"Inquiry\", scale=4)\n",
1216
- " model_selector = gr.Dropdown(choices=[\"gemini-2.0-flash\", \"meta-llama-3-70b-instruct\"], label=\"Generation Model\", value=\"gemini-2.0-flash\")\n",
1217
  " clear_btn = gr.Button(\"Clear History\", variant=\"secondary\")\n",
1218
- " \n",
1219
  " gr.Markdown(\"\"\"\n",
1220
  " <div style=\"text-align: center; color: #4a7c59;\">\n",
1221
  " <small>Powered by AskNature's Database | \n",
1222
  " Explore nature's blueprints at <a href=\"https://asknature.org\">asknature.org</a></small>\n",
1223
  " </div>\"\"\")\n",
 
1224
  " question.submit(chat_interface, [question, chatbot, model_selector], [question, chatbot])\n",
1225
  " clear_btn.click(lambda: [], None, chatbot)\n",
1226
  "\n",
1227
  "if __name__ == \"__main__\":\n",
1228
- " demo.launch(show_error=True)"
1229
  ]
1230
  },
1231
- {
1232
- "cell_type": "code",
1233
- "execution_count": null,
1234
- "metadata": {},
1235
- "outputs": [],
1236
- "source": []
1237
- },
1238
- {
1239
- "cell_type": "code",
1240
- "execution_count": null,
1241
- "metadata": {},
1242
- "outputs": [],
1243
- "source": []
1244
- },
1245
- {
1246
- "cell_type": "code",
1247
- "execution_count": null,
1248
- "metadata": {},
1249
- "outputs": [],
1250
- "source": []
1251
- },
1252
- {
1253
- "cell_type": "code",
1254
- "execution_count": null,
1255
- "metadata": {},
1256
- "outputs": [],
1257
- "source": []
1258
- },
1259
- {
1260
- "cell_type": "code",
1261
- "execution_count": null,
1262
- "metadata": {},
1263
- "outputs": [],
1264
- "source": []
1265
- },
1266
  {
1267
  "cell_type": "code",
1268
  "execution_count": null,
 
873
  "outputs": [],
874
  "source": []
875
  },
876
+ {
877
+ "cell_type": "code",
878
+ "execution_count": null,
879
+ "metadata": {},
880
+ "outputs": [],
881
+ "source": [
882
+ "# Combined Llama 3 and Gemini Flash Chatbot\n",
883
+ "import json\n",
884
+ "import logging\n",
885
+ "import re\n",
886
+ "import os\n",
887
+ "import pickle\n",
888
+ "from typing import List, Tuple, Optional\n",
889
+ "import gradio as gr\n",
890
+ "from openai import OpenAI\n",
891
+ "import google.generativeai as genai\n",
892
+ "from functools import lru_cache\n",
893
+ "from tenacity import retry, stop_after_attempt, wait_exponential\n",
894
+ "from langchain_community.retrievers import BM25Retriever\n",
895
+ "from langchain_community.vectorstores import FAISS\n",
896
+ "from langchain_core.embeddings import Embeddings\n",
897
+ "from langchain_core.documents import Document\n",
898
+ "from collections import defaultdict\n",
899
+ "import hashlib\n",
900
+ "from tqdm import tqdm\n",
901
+ "from dotenv import load_dotenv\n",
902
+ "\n",
903
+ "load_dotenv()\n",
904
+ "\n",
905
+ "# --- Configuration ---\n",
906
+ "FAISS_INDEX_PATH = \"faiss_index\"\n",
907
+ "BM25_INDEX_PATH = \"bm25_index.pkl\"\n",
908
+ "CACHE_VERSION = \"v1\"\n",
909
+ "embedding_model = \"e5-mistral-7b-instruct\"\n",
910
+ "data_file_name = \"AskNatureNet_data_enhanced.json\"\n",
911
+ "CHUNK_SIZE = 800\n",
912
+ "OVERLAP = 200\n",
913
+ "EMBEDDING_BATCH_SIZE = 32\n",
914
+ "\n",
915
+ "# Initialize clients\n",
916
+ "OPENAI_API_CONFIG = {\n",
917
+ " \"api_key\": os.getenv(\"OPENAI_API_KEY\"),\n",
918
+ " \"base_url\": \"https://chat-ai.academiccloud.de/v1\"\n",
919
+ "}\n",
920
+ "client = OpenAI(**OPENAI_API_CONFIG)\n",
921
+ "genai.configure(api_key=os.getenv(\"GEMINI_API_KEY\"))\n",
922
+ "\n",
923
+ "logging.basicConfig(level=logging.INFO)\n",
924
+ "logger = logging.getLogger(__name__)\n",
925
+ "\n",
926
+ "# --- Helper Functions ---\n",
927
+ "def get_data_hash(file_path: str) -> str:\n",
928
+ " \"\"\"Generate hash of data file for cache validation\"\"\"\n",
929
+ " with open(file_path, \"rb\") as f:\n",
930
+ " return hashlib.md5(f.read()).hexdigest()\n",
931
+ "\n",
932
+ "# --- Custom Embedding Handler ---\n",
933
+ "class MistralEmbeddings(Embeddings):\n",
934
+ " \"\"\"E5-Mistral-7B embedding adapter\"\"\"\n",
935
+ " def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
936
+ " embeddings = []\n",
937
+ " try:\n",
938
+ " for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc=\"Embedding Progress\"):\n",
939
+ " batch = texts[i:i + EMBEDDING_BATCH_SIZE]\n",
940
+ " response = client.embeddings.create(\n",
941
+ " input=batch,\n",
942
+ " model=embedding_model,\n",
943
+ " encoding_format=\"float\"\n",
944
+ " )\n",
945
+ " embeddings.extend([e.embedding for e in response.data])\n",
946
+ " return embeddings\n",
947
+ " except Exception as e:\n",
948
+ " logger.error(f\"Embedding Error: {str(e)}\")\n",
949
+ " return [[] for _ in texts]\n",
950
+ " \n",
951
+ " def embed_query(self, text: str) -> List[float]:\n",
952
+ " return self.embed_documents([text])[0]\n",
953
+ "\n",
954
+ "# --- Data Processing ---\n",
955
+ "def load_and_chunk_data(file_path: str) -> List[Document]:\n",
956
+ " \"\"\"Enhanced chunking with metadata preservation\"\"\"\n",
957
+ " current_hash = get_data_hash(file_path)\n",
958
+ " cache_file = f\"documents_{CACHE_VERSION}_{current_hash}.pkl\"\n",
959
+ " \n",
960
+ " if os.path.exists(cache_file):\n",
961
+ " logger.info(\"Loading cached documents\")\n",
962
+ " with open(cache_file, \"rb\") as f:\n",
963
+ " return pickle.load(f)\n",
964
+ " \n",
965
+ " with open(file_path, 'r', encoding='utf-8') as f:\n",
966
+ " data = json.load(f)\n",
967
+ " \n",
968
+ " documents = []\n",
969
+ " for item in tqdm(data, desc=\"Chunking Progress\"):\n",
970
+ " base_content = f\"\"\"Source: {item['Source']}\n",
971
+ "Application: {item['Application']}\n",
972
+ "Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}\n",
973
+ "Technical Concepts: {', '.join(item['technical_concepts'])}\n",
974
+ "Biological Mechanisms: {', '.join(item['biological_mechanisms'])}\"\"\"\n",
975
+ " \n",
976
+ " strategy = item['Strategy']\n",
977
+ " for i in range(0, len(strategy), CHUNK_SIZE - OVERLAP):\n",
978
+ " chunk = strategy[i:i + CHUNK_SIZE]\n",
979
+ " documents.append(Document(\n",
980
+ " page_content=f\"{base_content}\\nStrategy Excerpt:\\n{chunk}\",\n",
981
+ " metadata={\n",
982
+ " \"source\": item[\"Source\"],\n",
983
+ " \"application\": item[\"Application\"],\n",
984
+ " \"technical_concepts\": item[\"technical_concepts\"],\n",
985
+ " \"sustainability_impacts\": item[\"sustainability_impacts\"],\n",
986
+ " \"hyperlink\": item[\"Hyperlink\"],\n",
987
+ " \"chunk_id\": f\"{item['Source']}-{len(documents)+1}\"\n",
988
+ " }\n",
989
+ " ))\n",
990
+ " \n",
991
+ " with open(cache_file, \"wb\") as f:\n",
992
+ " pickle.dump(documents, f)\n",
993
+ " return documents\n",
994
+ "\n",
995
+ "# --- Optimized Retrieval System ---\n",
996
+ "class EnhancedRetriever:\n",
997
+ " \"\"\"Hybrid retriever with persistent caching\"\"\"\n",
998
+ " def __init__(self, documents: List[Document]):\n",
999
+ " self.documents = documents\n",
1000
+ " self.bm25 = self._init_bm25()\n",
1001
+ " self.vector_store = self._init_faiss()\n",
1002
+ " self.vector_retriever = self.vector_store.as_retriever(search_kwargs={\"k\": 3})\n",
1003
+ "\n",
1004
+ " def _init_bm25(self) -> BM25Retriever:\n",
1005
+ " cache_key = f\"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
1006
+ " if os.path.exists(cache_key):\n",
1007
+ " logger.info(\"Loading cached BM25 index\")\n",
1008
+ " with open(cache_key, \"rb\") as f:\n",
1009
+ " return pickle.load(f)\n",
1010
+ " \n",
1011
+ " logger.info(\"Building new BM25 index\")\n",
1012
+ " retriever = BM25Retriever.from_documents(self.documents)\n",
1013
+ " retriever.k = 5\n",
1014
+ " with open(cache_key, \"wb\") as f:\n",
1015
+ " pickle.dump(retriever, f)\n",
1016
+ " return retriever\n",
1017
+ "\n",
1018
+ " def _init_faiss(self) -> FAISS:\n",
1019
+ " cache_key = f\"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
1020
+ " if os.path.exists(cache_key):\n",
1021
+ " logger.info(\"Loading cached FAISS index\")\n",
1022
+ " return FAISS.load_local(\n",
1023
+ " cache_key,\n",
1024
+ " MistralEmbeddings(),\n",
1025
+ " allow_dangerous_deserialization=True\n",
1026
+ " )\n",
1027
+ " \n",
1028
+ " logger.info(\"Building new FAISS index\")\n",
1029
+ " vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())\n",
1030
+ " vector_store.save_local(cache_key)\n",
1031
+ " return vector_store\n",
1032
+ "\n",
1033
+ " @lru_cache(maxsize=500)\n",
1034
+ " def retrieve(self, query: str) -> str:\n",
1035
+ " try:\n",
1036
+ " processed_query = self._preprocess_query(query)\n",
1037
+ " expanded_query = self._hyde_expansion(processed_query)\n",
1038
+ " \n",
1039
+ " bm25_results = self.bm25.invoke(processed_query)\n",
1040
+ " vector_results = self.vector_retriever.invoke(processed_query)\n",
1041
+ " expanded_results = self.bm25.invoke(expanded_query)\n",
1042
+ " \n",
1043
+ " fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])\n",
1044
+ " return self._format_context(fused_results[:5])\n",
1045
+ " except Exception as e:\n",
1046
+ " logger.error(f\"Retrieval Error: {str(e)}\")\n",
1047
+ " return \"\"\n",
1048
+ "\n",
1049
+ " def _preprocess_query(self, query: str) -> str:\n",
1050
+ " return query.lower().strip()\n",
1051
+ "\n",
1052
+ " @lru_cache(maxsize=500)\n",
1053
+ " def _hyde_expansion(self, query: str) -> str:\n",
1054
+ " try:\n",
1055
+ " response = client.chat.completions.create(\n",
1056
+ " model=\"meta-llama-3-70b-instruct\",\n",
1057
+ " messages=[{\n",
1058
+ " \"role\": \"user\",\n",
1059
+ " \"content\": f\"Generate a technical draft about biomimicry for: {query}\\nInclude domain-specific terms.\"\n",
1060
+ " }],\n",
1061
+ " temperature=0.5,\n",
1062
+ " max_tokens=200\n",
1063
+ " )\n",
1064
+ " return response.choices[0].message.content\n",
1065
+ " except Exception as e:\n",
1066
+ " logger.error(f\"HyDE Error: {str(e)}\")\n",
1067
+ " return query\n",
1068
+ "\n",
1069
+ " def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:\n",
1070
+ " fused_scores = defaultdict(float)\n",
1071
+ " for docs in result_sets:\n",
1072
+ " for rank, doc in enumerate(docs, 1):\n",
1073
+ " fused_scores[doc.metadata[\"chunk_id\"]] += 1 / (rank + 60)\n",
1074
+ " \n",
1075
+ " seen = set()\n",
1076
+ " return [\n",
1077
+ " doc for doc in sorted(\n",
1078
+ " (doc for docs in result_sets for doc in docs),\n",
1079
+ " key=lambda x: fused_scores[x.metadata[\"chunk_id\"]],\n",
1080
+ " reverse=True\n",
1081
+ " ) if not (doc.metadata[\"chunk_id\"] in seen or seen.add(doc.metadata[\"chunk_id\"]))\n",
1082
+ " ]\n",
1083
+ "\n",
1084
+ " def _format_context(self, docs: List[Document]) -> str:\n",
1085
+ " context = []\n",
1086
+ " for doc in docs:\n",
1087
+ " context_str = f\"\"\"**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})\n",
1088
+ " **Application**: {doc.metadata['application']}\n",
1089
+ " **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}\n",
1090
+ " **Strategy Excerpt**:\\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}\"\"\"\n",
1091
+ " context.append(context_str)\n",
1092
+ " return \"\\n\\n---\\n\\n\".join(context)\n",
1093
+ "\n",
1094
+ "# --- Generation System ---\n",
1095
+ "SYSTEM_PROMPT = \"\"\"**Biomimicry Expert Guidelines**\n",
1096
+ "1. Firstly Base answers strictly on context and if there is not context answer by your own.\n",
1097
+ "2. Cite sources as [Source] witht the hyperlink\n",
1098
+ "3. **Bold** technical terms\n",
1099
+ "4. Include reference links at the end of the response\n",
1100
+ "\n",
1101
+ "Context: {context}\"\"\"\n",
1102
+ "\n",
1103
+ "@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))\n",
1104
+ "def get_ai_response(query: str, context: str, model: str) -> str:\n",
1105
+ " try:\n",
1106
+ " if model == \"gemini-2.0-flash\":\n",
1107
+ " gemini_model = genai.GenerativeModel(model)\n",
1108
+ " response = gemini_model.generate_content(\n",
1109
+ " f\"{SYSTEM_PROMPT.format(context=context)}\\nQuestion: {query}\\nProvide a detailed technical answer:\"\n",
1110
+ " )\n",
1111
+ " return _postprocess_response(response.text)\n",
1112
+ " elif model == \"meta-llama-3-70b-instruct\":\n",
1113
+ " response = client.chat.completions.create(\n",
1114
+ " model=model,\n",
1115
+ " messages=[\n",
1116
+ " {\"role\": \"system\", \"content\": SYSTEM_PROMPT.format(context=context)},\n",
1117
+ " {\"role\": \"user\", \"content\": f\"Question: {query}\\nProvide a detailed technical answer:\"}\n",
1118
+ " ],\n",
1119
+ " temperature=0.4,\n",
1120
+ " max_tokens=2000\n",
1121
+ " )\n",
1122
+ " return _postprocess_response(response.choices[0].message.content)\n",
1123
+ " except Exception as e:\n",
1124
+ " logger.error(f\"Generation Error: {str(e)}\")\n",
1125
+ " return \"I'm unable to generate a response right now. Please try again later.\"\n",
1126
+ "\n",
1127
+ "def _postprocess_response(response: str) -> str:\n",
1128
+ " response = re.sub(r\"\\[(.*?)\\]\", r\"[\\1](#)\", response)\n",
1129
+ " response = re.sub(r\"\\*\\*([\\w-]+)\\*\\*\", r\"**\\1**\", response)\n",
1130
+ " return response\n",
1131
+ "\n",
1132
+ "# --- Pipeline ---\n",
1133
+ "documents = load_and_chunk_data(data_file_name)\n",
1134
+ "retriever = EnhancedRetriever(documents)\n",
1135
+ "\n",
1136
+ "def generate_response(question: str, model: str) -> str:\n",
1137
+ " try:\n",
1138
+ " context = retriever.retrieve(question)\n",
1139
+ " return get_ai_response(question, context, model) if context else \"No relevant information found.\"\n",
1140
+ " except Exception as e:\n",
1141
+ " logger.error(f\"Pipeline Error: {str(e)}\")\n",
1142
+ " return \"An error occurred processing your request.\"\n",
1143
+ "\n",
1144
+ "# --- Gradio Interface ---\n",
1145
+ "def chat_interface(question: str, history: List[Tuple[str, str]], model: str):\n",
1146
+ " response = generate_response(question, model)\n",
1147
+ " return \"\", history + [(question, response)]\n",
1148
+ "\n",
1149
+ "with gr.Blocks(title=\"AskNature BioRAG Expert\", theme=gr.themes.Soft()) as demo:\n",
1150
+ " gr.Markdown(\"# 🌿 AskNature RAG-based Chatbot \")\n",
1151
+ " with gr.Row():\n",
1152
+ " chatbot = gr.Chatbot(label=\"Dialogue History\", height=500)\n",
1153
+ " with gr.Row():\n",
1154
+ " question = gr.Textbox(placeholder=\"Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')\",\n",
1155
+ " label=\"Inquiry\", scale=4)\n",
1156
+ " model_selector = gr.Dropdown(choices=[\"gemini-2.0-flash\", \"meta-llama-3-70b-instruct\"], label=\"Generation Model\", value=\"gemini-2.0-flash\")\n",
1157
+ " clear_btn = gr.Button(\"Clear History\", variant=\"secondary\")\n",
1158
+ " \n",
1159
+ " gr.Markdown(\"\"\"\n",
1160
+ " <div style=\"text-align: center; color: #4a7c59;\">\n",
1161
+ " <small>Powered by AskNature's Database | \n",
1162
+ " Explore nature's blueprints at <a href=\"https://asknature.org\">asknature.org</a></small>\n",
1163
+ " </div>\"\"\")\n",
1164
+ " question.submit(chat_interface, [question, chatbot, model_selector], [question, chatbot])\n",
1165
+ " clear_btn.click(lambda: [], None, chatbot)\n",
1166
+ "\n",
1167
+ "if __name__ == \"__main__\":\n",
1168
+ " demo.launch(show_error=True)"
1169
+ ]
1170
+ },
1171
+ {
1172
+ "cell_type": "code",
1173
+ "execution_count": null,
1174
+ "metadata": {},
1175
+ "outputs": [],
1176
+ "source": []
1177
+ },
1178
+ {
1179
+ "cell_type": "code",
1180
+ "execution_count": null,
1181
+ "metadata": {},
1182
+ "outputs": [],
1183
+ "source": []
1184
+ },
1185
  {
1186
  "cell_type": "code",
1187
  "execution_count": null,
 
1194
  "INFO:__main__:Loading cached documents\n",
1195
  "INFO:__main__:Loading cached BM25 index\n",
1196
  "INFO:__main__:Loading cached FAISS index\n",
 
 
1197
  "c:\\Users\\Mohamed Elsafty\\.conda\\envs\\rag\\Lib\\site-packages\\gradio\\components\\chatbot.py:273: UserWarning: You have not specified a value for the `type` parameter. Defaulting to the 'tuples' format for chatbot messages, but this is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style dictionaries with 'role' and 'content' keys.\n",
1198
  " warnings.warn(\n"
1199
  ]
 
1202
  "name": "stdout",
1203
  "output_type": "stream",
1204
  "text": [
1205
+ "* Running on local URL: http://127.0.0.1:7861\n"
1206
  ]
1207
  },
1208
  {
1209
  "name": "stderr",
1210
  "output_type": "stream",
1211
  "text": [
1212
+ "INFO:httpx:HTTP Request: GET http://127.0.0.1:7861/gradio_api/startup-events \"HTTP/1.1 200 OK\"\n",
1213
+ "INFO:httpx:HTTP Request: HEAD http://127.0.0.1:7861/ \"HTTP/1.1 200 OK\"\n"
 
1214
  ]
1215
  },
1216
  {
 
1224
  {
1225
  "data": {
1226
  "text/html": [
1227
+ "<div><iframe src=\"http://127.0.0.1:7861/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
1228
  ],
1229
  "text/plain": [
1230
  "<IPython.core.display.HTML object>"
 
1237
  "name": "stderr",
1238
  "output_type": "stream",
1239
  "text": [
1240
+ "INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version \"HTTP/1.1 200 OK\"\n",
1241
  "INFO:httpx:HTTP Request: POST https://chat-ai.academiccloud.de/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
1242
  "Embedding Progress: 0%| | 0/1 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://chat-ai.academiccloud.de/v1/embeddings \"HTTP/1.1 200 OK\"\n",
1243
+ "Embedding Progress: 100%|██████████| 1/1 [00:00<00:00, 6.02it/s]\n",
1244
+ "INFO:__main__:Response from gemini-2.0-flash: Inspired by the **protein-based pigmentation** found in *Discosoma* coral, Werewool is developing biodegradable fibers for the textile industry [1]. This innovation utilizes **DNA-level fiber design** and **biotechnology** to create sustainable textiles with inherent color and other properties, eliminating the need for toxic dyes and petroleum-based synthetics [1]. The *Discosoma* coral's **red fluorescent protein (RFP)** serves as a model for creating color within the fiber itself [1].\n",
1245
+ "\n",
1246
+ "Furthermore, corals utilize **coral acid-rich proteins (CARP)** to build their rock-hard reefs [2]. These proteins interact with minerals like calcium carbonate, shaping them into **aragonite** crystals [2]. The corals then combine these crystals with other organic molecules, acting as cement to form the strong coral skeletons [2]. This **protein-mineral interaction**, along with the **molecular scaffolding** provided by proteins such as **collagen-based structure**, inspires the creation of durable materials [2].\n",
1247
+ "\n",
1248
+ "These approaches reduce the textile industry's reliance on harmful chemicals and unsustainable materials [1]. The global textile market produces 1.2 billion tons of CO2 equivalent per year and uses dyes that are responsible for 20% of global wastewater. The industry also depends on petroleum based synthetic fibers that account for 35% of global microplastic pollution [1].\n",
1249
+ "\n",
1250
+ "References:\n",
1251
+ "[1] https://asknature.org/innovation/colorful-fibers-inspired-by-proteins-found-in-discosoma-coral/\n",
1252
+ "[2] https://asknature.org/strategy/how-proteins-help-corals-build-rock-hard-reefs/\n",
1253
+ "\n",
1254
+ "INFO:httpx:HTTP Request: POST https://chat-ai.academiccloud.de/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
1255
+ "INFO:__main__:Response from meta-llama-3-70b-instruct: ChatCompletion(id='chat-2b0d6684e9744a0ba5c48bad6bcdd4d7', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"**Coral Proteins: A Sustainable Solution for Eco-Friendly Fabrics**\\n\\nThe Discosoma coral's protein, specifically the red fluorescent protein (RFP), has inspired a revolutionary approach to creating eco-friendly fabrics without the need for toxic dyes. This innovation leverages the principles of **protein-based pigmentation** and **DNA-level fiber design** to produce biodegradable fibers with inherent color properties.\\n\\n**The Science Behind Coral Proteins**\\n\\nThe RFP found in Discosoma coral is a naturally occurring protein that provides the coral with its vibrant color. This protein is capable of absorbing light and emitting it at a longer wavelength, resulting in a red fluorescent color. By harnessing the properties of this protein, researchers can create fibers that exhibit similar color characteristics without the need for synthetic dyes.\\n\\n**How Coral Proteins Are Used in Fabric Production**\\n\\nTo create eco-friendly fabrics, researchers use biotechnology to design fibers at the DNA level. This involves identifying the protein structures found in nature, such as the RFP, and incorporating them into the fiber production process. The resulting fibers are reliant on these proteins for their color properties, eliminating the need for toxic dyes and finishes.\\n\\n**The Benefits of Coral Protein-Based Fabrics**\\n\\nThe use of coral proteins in fabric production offers several sustainability benefits, including:\\n\\n1. **Reduced environmental impact**: By eliminating the need for synthetic dyes and finishes, coral protein-based fabrics reduce the amount of toxic chemicals released into the environment.\\n2. 
**Biodegradable**: The fibers produced using coral proteins are biodegradable, reducing the amount of microplastic pollution in the environment.\\n3. **Sustainable production**: The use of biotechnology and natural proteins reduces the reliance on non-renewable resources and minimizes the carbon footprint of fabric production.\\n\\n**Conclusion**\\n\\nThe use of coral proteins in fabric production represents a significant step towards creating eco-friendly and sustainable textiles. By harnessing the power of nature and leveraging the principles of protein-based pigmentation and DNA-level fiber design, researchers can create biodegradable fibers with inherent color properties, reducing the environmental impact of the textile industry.\\n\\nReferences:\\n[1] https://asknature.org/innovation/colorful-fibers-inspired-by-proteins-found-in-discosoma-coral/\", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[]), stop_reason=None)], created=1739018886, model='meta-llama-3.1-70b-instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=459, prompt_tokens=1341, total_tokens=1800, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None)\n",
1256
+ "INFO:__main__:Groq API Response: {'id': 'chatcmpl-55dc0b3b-1528-416e-bd30-93b4a851484e', 'object': 'chat.completion', 'created': 1739018907, 'model': 'llama3-70b-8192', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': \"Coral proteins are not directly involved in the production of eco-friendly fabrics without dyes. However, coral-inspired technologies have led to the development of innovative textile production methods that reduce the need for synthetic dyes.\\n\\nHere's how it works:\\n\\n1. **Biomimicry**: Scientists have taken inspiration from the vibrant colors and unique properties of coral reefs to develop sustainable textile production methods. Coral reefs are known for their incredible ability to display a range of colors without using pigments, instead relying on the structure of their skeletons to refract and scatter light.\\n2. **Structural coloration**: Researchers have replicated this natural phenomenon by developing fibers with nanostructured surfaces that create color through the manipulation of light. This approach, known as structural coloration, eliminates the need for synthetic dyes and pigments.\\n3. **Protein-based fibers**: Some companies are using protein-based fibers, such as those derived from silk, soy, or milk, to create sustainable textiles. These fibers can be engineered to have specific properties, such as structural coloration, that reduce the need for dyes.\\n4. **Natural coloration**: Other innovations involve using natural colorants, such as plant-based dyes, to create a more sustainable textile production process. These natural colorants can be derived from various sources, including fruits, vegetables, and spices.\\n\\nWhile coral proteins are not directly involved in this process, the biomimicry approach inspired by coral reefs has led to the development of innovative, eco-friendly textile production methods that reduce the need for synthetic dyes. 
These sustainable textiles have the potential to minimize environmental impacts, such as water pollution and energy consumption, associated with traditional dyeing processes.\"}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'queue_time': 0.016959538, 'prompt_tokens': 23, 'prompt_time': 0.004020428, 'completion_tokens': 339, 'completion_time': 1.021682599, 'total_tokens': 362, 'total_time': 1.025703027}, 'system_fingerprint': 'fp_753a4aecf6', 'x_groq': {'id': 'req_01jkjsnbgpf5n9ybppv09d1rb5'}}\n",
1257
+ "INFO:__main__:Response from llama3-70b-8192: Coral proteins are not directly involved in the production of eco-friendly fabrics without dyes. However, coral-inspired technologies have led to the development of innovative textile production methods that reduce the need for synthetic dyes.\n",
1258
+ "\n",
1259
+ "Here's how it works:\n",
1260
+ "\n",
1261
+ "1. **Biomimicry**: Scientists have taken inspiration from the vibrant colors and unique properties of coral reefs to develop sustainable textile production methods. Coral reefs are known for their incredible ability to display a range of colors without using pigments, instead relying on the structure of their skeletons to refract and scatter light.\n",
1262
+ "2. **Structural coloration**: Researchers have replicated this natural phenomenon by developing fibers with nanostructured surfaces that create color through the manipulation of light. This approach, known as structural coloration, eliminates the need for synthetic dyes and pigments.\n",
1263
+ "3. **Protein-based fibers**: Some companies are using protein-based fibers, such as those derived from silk, soy, or milk, to create sustainable textiles. These fibers can be engineered to have specific properties, such as structural coloration, that reduce the need for dyes.\n",
1264
+ "4. **Natural coloration**: Other innovations involve using natural colorants, such as plant-based dyes, to create a more sustainable textile production process. These natural colorants can be derived from various sources, including fruits, vegetables, and spices.\n",
1265
+ "\n",
1266
+ "While coral proteins are not directly involved in this process, the biomimicry approach inspired by coral reefs has led to the development of innovative, eco-friendly textile production methods that reduce the need for synthetic dyes. These sustainable textiles have the potential to minimize environmental impacts, such as water pollution and energy consumption, associated with traditional dyeing processes.\n"
1267
  ]
1268
  }
1269
  ],
1270
  "source": [
1271
+ "# Combined chatbot: Gemini Flash, Meta Llama 3 (GWDG), and Llama 3 (Groq)\n",
1272
+ "# Gemini Flash rate limit: 15 requests per minute\n",
1273
+ "# Groq rate limits: 30 RPM, 14,400 RPD, 6K TPM, and 500K TPD\n",
1274
+ "\n",
1275
+ "import os\n",
1276
  "import json\n",
1277
  "import logging\n",
1278
  "import re\n",
1279
+ "from typing import List, Tuple\n",
 
 
1280
  "import gradio as gr\n",
1281
  "from openai import OpenAI\n",
1282
  "import google.generativeai as genai\n",
1283
+ "import requests\n",
1284
  "from functools import lru_cache\n",
1285
  "from tenacity import retry, stop_after_attempt, wait_exponential\n",
1286
  "from langchain_community.retrievers import BM25Retriever\n",
 
1477
  " context = []\n",
1478
  " for doc in docs:\n",
1479
  " context_str = f\"\"\"**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})\n",
1480
+ "**Application**: {doc.metadata['application']}\n",
1481
+ "**Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}\n",
1482
+ "**Strategy Excerpt**:\n",
1483
+ "{doc.page_content.split('Strategy Excerpt:')[-1].strip()}\"\"\"\n",
1484
  " context.append(context_str)\n",
1485
  " return \"\\n\\n---\\n\\n\".join(context)\n",
1486
  "\n",
1487
  "# --- Generation System ---\n",
1488
+ "SYSTEM_PROMPT = \"\"\"\n",
1489
+ "**Biomimicry Expert Guidelines**\n",
 
 
 
1490
  "\n",
1491
+ "- Use only the provided AskNature context (e.g., Source, Application, Strategy, technical_concepts). If no context is given, note that you're using your own expertise.\n",
1492
+ "- When referencing facts, use numeric citations in square brackets (e.g., [1]). Do not include full URLs inline.\n",
1493
+ "- Bold all technical terms (e.g., **protein-based pigmentation**, **DNA-level fiber design**).\n",
1494
+ "- Provide a concise, expert answer that explains the innovation and its sustainability benefits.\n",
1495
+ "- End your response with a \"References\" section listing each URL with its citation number.\n",
1496
+ "\n",
1497
+ "Context: {context}\n",
1498
+ "\"\"\"\n",
1499
  "\n",
1500
  "@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))\n",
1501
  "def get_ai_response(query: str, context: str, model: str) -> str:\n",
1502
+ " result = \"\" # Initialize the result variable\n",
1503
  " try:\n",
1504
  " if model == \"gemini-2.0-flash\":\n",
1505
  " gemini_model = genai.GenerativeModel(model)\n",
1506
  " response = gemini_model.generate_content(\n",
1507
  " f\"{SYSTEM_PROMPT.format(context=context)}\\nQuestion: {query}\\nProvide a detailed technical answer:\"\n",
1508
  " )\n",
1509
+ " logger.info(f\"Response from gemini-2.0-flash: {response.text}\")\n",
1510
+ " result = _postprocess_response(response.text)\n",
1511
  " elif model == \"meta-llama-3-70b-instruct\":\n",
1512
  " response = client.chat.completions.create(\n",
1513
  " model=model,\n",
 
1518
  " temperature=0.4,\n",
1519
  " max_tokens=2000\n",
1520
  " )\n",
1521
+ " logger.info(f\"Response from meta-llama-3-70b-instruct: {response}\")\n",
1522
+ " try:\n",
1523
+ " result = response.choices[0].message.content\n",
1524
+ " except Exception as e:\n",
1525
+ " logger.error(f\"Error processing meta-llama-3-70b-instruct response: {str(e)}\")\n",
1526
+ " result = \"Failed to process response from meta-llama-3-70b-instruct\"\n",
1527
+ " elif model == \"llama3-70b-8192\":\n",
1528
+ " result = get_groq_llama3_response(query)\n",
1529
+ " logger.info(f\"Response from llama3-70b-8192: {result}\")\n",
1530
+ " if result is None:\n",
1531
+ " result = \"Failed to get response from llama3-70b-8192\"\n",
1532
+ " # Append the model name to the response for clarity\n",
1533
+ " result += f\"\\n\\n**Model:** {model}\"\n",
1534
+ " return result\n",
1535
  " except Exception as e:\n",
1536
  " logger.error(f\"Generation Error: {str(e)}\")\n",
1537
  " return \"I'm unable to generate a response right now. Please try again later.\"\n",
 
1541
  " response = re.sub(r\"\\*\\*([\\w-]+)\\*\\*\", r\"**\\1**\", response)\n",
1542
  " return response\n",
1543
  "\n",
1544
+ "def get_groq_llama3_response(query: str) -> str:\n",
1545
+ " \"\"\"Get response from Llama 3 on Groq Cloud.\"\"\"\n",
1546
+ " api_key = os.getenv(\"GROQ_API_KEY\")\n",
1547
+ " url = \"https://api.groq.com/openai/v1/chat/completions\"\n",
1548
+ " \n",
1549
+ " headers = {\n",
1550
+ " \"Content-Type\": \"application/json\",\n",
1551
+ " \"Authorization\": f\"Bearer {api_key}\"\n",
1552
+ " }\n",
1553
+ " \n",
1554
+ " payload = {\n",
1555
+ " \"model\": \"llama3-70b-8192\",\n",
1556
+ " \"messages\": [\n",
1557
+ " {\n",
1558
+ " \"role\": \"user\",\n",
1559
+ " \"content\": query\n",
1560
+ " }\n",
1561
+ " ]\n",
1562
+ " }\n",
1563
+ " \n",
1564
+ " try:\n",
1565
+ " response = requests.post(url, headers=headers, json=payload)\n",
1566
+ " response.raise_for_status()\n",
1567
+ " result = response.json()\n",
1568
+ " logger.info(f\"Groq API Response: {result}\")\n",
1569
+ " return result[\"choices\"][0][\"message\"][\"content\"]\n",
1570
+ " except requests.exceptions.RequestException as e:\n",
1571
+ " logger.error(f\"Groq API Error: {str(e)}\")\n",
1572
+ " return \"An error occurred while contacting Groq's Llama 3 model.\"\n",
1573
+ "\n",
1574
  "# --- Pipeline ---\n",
1575
  "documents = load_and_chunk_data(data_file_name)\n",
1576
  "retriever = EnhancedRetriever(documents)\n",
 
1589
  " return \"\", history + [(question, response)]\n",
1590
  "\n",
1591
  "with gr.Blocks(title=\"AskNature BioRAG Expert\", theme=gr.themes.Soft()) as demo:\n",
1592
+ " gr.Markdown(\"# 🌿 AskNature RAG-based Chatbot\")\n",
1593
  " with gr.Row():\n",
1594
  " chatbot = gr.Chatbot(label=\"Dialogue History\", height=500)\n",
1595
  " with gr.Row():\n",
1596
+ " question = gr.Textbox(placeholder=\"Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')\", label=\"Inquiry\", scale=4)\n",
1597
+ " model_selector = gr.Dropdown(choices=[\"gemini-2.0-flash\", \"meta-llama-3-70b-instruct(GWDG)\", \"llama3-70b-8192(Groq)\"], label=\"Generation Model\", value=\"gemini-2.0-flash\")\n",
 
1598
  " clear_btn = gr.Button(\"Clear History\", variant=\"secondary\")\n",
1599
+ "\n",
1600
  " gr.Markdown(\"\"\"\n",
1601
  " <div style=\"text-align: center; color: #4a7c59;\">\n",
1602
  " <small>Powered by AskNature's Database | \n",
1603
  " Explore nature's blueprints at <a href=\"https://asknature.org\">asknature.org</a></small>\n",
1604
  " </div>\"\"\")\n",
1605
+ " \n",
1606
  " question.submit(chat_interface, [question, chatbot, model_selector], [question, chatbot])\n",
1607
  " clear_btn.click(lambda: [], None, chatbot)\n",
1608
  "\n",
1609
  "if __name__ == \"__main__\":\n",
1610
+ " demo.launch(show_error=True)\n"
1611
  ]
1612
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1613
  {
1614
  "cell_type": "code",
1615
  "execution_count": null,