Mohamed284 commited on
Commit
60e98cb
·
verified ·
1 Parent(s): a6c8ffe

Upload 8 files

Browse files
.env ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# API Configuration
# SECURITY(review): these credentials are committed to version control and
# must be considered leaked — rotate every key below and rely on .gitignore
# to keep this file out of future commits.
# (Unresolved merge conflict markers removed: both sides were identical.)
OPENAI_API_KEY="d1c9ed1ca70b9721dee1087d93f9662a"
GEMINI_API_KEY="AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
# GCP_PROJECT_ID="1008673779731"
# GCP_API_KEY="AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"

GEMINI_API_KEY_1= "AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
GEMINI_API_KEY_2= "AIzaSyDzQSzM9vA6Le36V65I2meN5URclq4JSx0"
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ bm25_index.pkl_5c0c37d3cbc20e235eeec7cffd2d312f filter=lfs diff=lfs merge=lfs -text
2
+ documents_v1_5c0c37d3cbc20e235eeec7cffd2d312f.pkl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
# Merge conflict resolved (both sides were identical). The pattern was
# previously written as ".env" — gitignore takes the quotes literally, so
# the actual .env file was NOT ignored and got committed. Unquoted fixes it.
.env
AskNatureNet_data_enhanced.json ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Optimized RAG System with E5-Mistral Embeddings and Gemini Flash Generation
#
# NOTE(review): as committed, this file contained unresolved git merge
# conflict markers (<<<<<<< HEAD / ======= / >>>>>>>) wrapping two identical
# copies of the entire module, which made it invalid Python. The conflict is
# resolved here by keeping a single copy; no logic was changed.
"""AskNature biomimicry RAG chatbot.

Hybrid retrieval (BM25 + FAISS over E5-Mistral embeddings) with HyDE query
expansion, answer generation via Gemini Flash, served through a Gradio chat
interface. Indexes and chunked documents are cached on disk, keyed by an
MD5 hash of the data file so caches invalidate automatically when the data
changes.
"""
import json
import logging
import re
import os
import pickle
from typing import List, Tuple, Optional
import gradio as gr
from openai import OpenAI
from google import genai
from functools import lru_cache
from tenacity import retry, stop_after_attempt, wait_exponential
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from collections import defaultdict
import hashlib
from tqdm import tqdm

from dotenv import load_dotenv
load_dotenv()

# --- Configuration ---
FAISS_INDEX_PATH = "faiss_index"
BM25_INDEX_PATH = "bm25_index.pkl"
CACHE_VERSION = "v1"  # Increment when data format changes
embedding_model = "e5-mistral-7b-instruct"  # served via the OpenAI-compatible endpoint below
generation_model = "gemini-2.0-flash"  # Gemini generation model
data_file_name = "AskNatureNet_data_enhanced.json"
API_CONFIG = {
    "gemini_api_key": os.getenv("GEMINI_API_KEY")  # Gemini API key for generation
}

CHUNK_SIZE = 800
OVERLAP = 200
EMBEDDING_BATCH_SIZE = 32  # Batch size for embedding API calls

# Initialize clients
OPENAI_API_CONFIG = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    "base_url": "https://chat-ai.academiccloud.de/v1"
}
client = OpenAI(**OPENAI_API_CONFIG)
gemini_client = genai.Client(api_key=API_CONFIG["gemini_api_key"])  # Gemini client for generation
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Helper Functions ---
def get_data_hash(file_path: str) -> str:
    """Generate hash of data file for cache validation"""
    # MD5 is used only as a cache key here, not for security.
    with open(file_path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()

# --- Custom Embedding Handler with Progress Tracking ---
class MistralEmbeddings(Embeddings):
    """E5-Mistral-7B embedding adapter with error handling and progress tracking"""
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed *texts* in batches; returns empty vectors on failure."""
        embeddings = []
        try:
            # Process in batches with progress tracking
            for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="Embedding Progress"):
                batch = texts[i:i + EMBEDDING_BATCH_SIZE]
                response = client.embeddings.create(
                    input=batch,
                    model=embedding_model,
                    encoding_format="float"
                )
                embeddings.extend([e.embedding for e in response.data])
            return embeddings
        except Exception as e:
            # NOTE(review): returning empty vectors keeps the pipeline alive but
            # will likely break FAISS indexing downstream — confirm intended.
            logger.error(f"Embedding Error: {str(e)}")
            return [[] for _ in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string via embed_documents."""
        return self.embed_documents([text])[0]

# --- Data Processing with Cache Validation ---
def load_and_chunk_data(file_path: str) -> List[Document]:
    """Enhanced chunking with metadata preservation"""
    # Cache is keyed by data-file hash so edits to the data invalidate it.
    current_hash = get_data_hash(file_path)
    cache_file = f"documents_{CACHE_VERSION}_{current_hash}.pkl"

    if os.path.exists(cache_file):
        logger.info("Loading cached documents")
        # NOTE(review): pickle.load trusts the local cache file; safe only
        # because the file is produced by this same process.
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    documents = []
    for item in tqdm(data, desc="Chunking Progress"):
        # Shared metadata header prepended to every chunk of this item.
        base_content = f"""Source: {item['Source']}
Application: {item['Application']}
Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}
Technical Concepts: {', '.join(item['technical_concepts'])}
Biological Mechanisms: {', '.join(item['biological_mechanisms'])}"""

        strategy = item['Strategy']
        # Sliding window: CHUNK_SIZE chars advancing by CHUNK_SIZE - OVERLAP.
        for i in range(0, len(strategy), CHUNK_SIZE - OVERLAP):
            chunk = strategy[i:i + CHUNK_SIZE]
            documents.append(Document(
                page_content=f"{base_content}\nStrategy Excerpt:\n{chunk}",
                metadata={
                    "source": item["Source"],
                    "application": item["Application"],
                    "technical_concepts": item["technical_concepts"],
                    "sustainability_impacts": item["sustainability_impacts"],
                    "hyperlink": item["Hyperlink"],
                    "chunk_id": f"{item['Source']}-{len(documents)+1}"
                }
            ))

    with open(cache_file, "wb") as f:
        pickle.dump(documents, f)
    return documents

# --- Optimized Retrieval System ---
class EnhancedRetriever:
    """Hybrid retriever with persistent caching"""
    def __init__(self, documents: List[Document]):
        self.documents = documents
        self.bm25 = self._init_bm25()
        self.vector_store = self._init_faiss()
        self.vector_retriever = self.vector_store.as_retriever(search_kwargs={"k": 3})

    def _init_bm25(self) -> BM25Retriever:
        """Load the BM25 index from cache or build and persist it."""
        cache_key = f"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}"
        if os.path.exists(cache_key):
            logger.info("Loading cached BM25 index")
            with open(cache_key, "rb") as f:
                return pickle.load(f)

        logger.info("Building new BM25 index")
        retriever = BM25Retriever.from_documents(self.documents)
        retriever.k = 5
        with open(cache_key, "wb") as f:
            pickle.dump(retriever, f)
        return retriever

    def _init_faiss(self) -> FAISS:
        """Load the FAISS index from cache or build and persist it."""
        cache_key = f"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}"
        if os.path.exists(cache_key):
            logger.info("Loading cached FAISS index")
            # allow_dangerous_deserialization: acceptable because the index
            # directory is produced locally by this same application.
            return FAISS.load_local(
                cache_key,
                MistralEmbeddings(),
                allow_dangerous_deserialization=True
            )

        logger.info("Building new FAISS index")
        vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())
        vector_store.save_local(cache_key)
        return vector_store

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the instance alive for the cache's lifetime; acceptable here because a
    # single module-level retriever is used.
    @lru_cache(maxsize=500)
    def retrieve(self, query: str) -> str:
        """Hybrid retrieval: BM25 + vector + HyDE-expanded BM25, fused and formatted."""
        try:
            processed_query = self._preprocess_query(query)
            expanded_query = self._hyde_expansion(processed_query)

            bm25_results = self.bm25.invoke(processed_query)
            vector_results = self.vector_retriever.invoke(processed_query)
            expanded_results = self.bm25.invoke(expanded_query)

            fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])
            return self._format_context(fused_results[:5])
        except Exception as e:
            logger.error(f"Retrieval Error: {str(e)}")
            return ""

    def _preprocess_query(self, query: str) -> str:
        """Normalize a query: lowercase and strip surrounding whitespace."""
        return query.lower().strip()

    @lru_cache(maxsize=500)
    def _hyde_expansion(self, query: str) -> str:
        """HyDE: draft a hypothetical answer to enrich lexical retrieval; falls back to the raw query."""
        try:
            response = gemini_client.models.generate_content(  # Use Gemini client for HyDE
                model=generation_model,
                contents=f"Generate a technical draft about biomimicry for: {query}\nInclude domain-specific terms."
            )
            return response.text
        except Exception as e:
            logger.error(f"HyDE Error: {str(e)}")
            return query

    def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:
        """Reciprocal-rank fusion (k=60) across result sets, deduplicated by chunk_id."""
        fused_scores = defaultdict(float)
        for docs in result_sets:
            for rank, doc in enumerate(docs, 1):
                fused_scores[doc.metadata["chunk_id"]] += 1 / (rank + 60)

        seen = set()
        return [
            doc for doc in sorted(
                (doc for docs in result_sets for doc in docs),
                key=lambda x: fused_scores[x.metadata["chunk_id"]],
                reverse=True
            ) if not (doc.metadata["chunk_id"] in seen or seen.add(doc.metadata["chunk_id"]))
        ]

    def _format_context(self, docs: List[Document]) -> str:
        """Render retrieved chunks as a markdown context string for the prompt."""
        context = []
        for doc in docs:
            context_str = f"""**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})
**Application**: {doc.metadata['application']}
**Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}
**Strategy Excerpt**:\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}"""
            context.append(context_str)
        return "\n\n---\n\n".join(context)

# --- Generation System ---
SYSTEM_PROMPT = """**Biomimicry Expert Guidelines**
1. Base answers strictly on context
2. **Bold** technical terms
3. Include reference links at the end of the response

Context: {context}"""

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))
def get_ai_response(query: str, context: str) -> str:
    """Generate a grounded answer with Gemini; retries with exponential backoff."""
    try:
        response = gemini_client.models.generate_content(  # Use Gemini client for generation
            model=generation_model,
            contents=f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
        )
        logger.info(f"Raw Response: {response.text}")  # Log raw response
        return _postprocess_response(response.text)
    except Exception as e:
        logger.error(f"Generation Error: {str(e)}")
        return "I'm unable to generate a response right now. Please try again later."

def _postprocess_response(response: str) -> str:
    """Light markdown cleanup: turn bare [text] into placeholder links."""
    response = re.sub(r"\[(.*?)\]", r"[\1](#)", response)
    # NOTE(review): this substitution rewrites **word** to itself (no-op) —
    # presumably a leftover; kept for behavioral parity.
    response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
    return response

# --- Optimized Pipeline ---
documents = load_and_chunk_data(data_file_name)
retriever = EnhancedRetriever(documents)

def generate_response(question: str) -> str:
    """Full pipeline: retrieve context, then generate an answer."""
    try:
        context = retriever.retrieve(question)
        return get_ai_response(question, context) if context else "No relevant information found."
    except Exception as e:
        logger.error(f"Pipeline Error: {str(e)}")
        return "An error occurred processing your request."

# --- Gradio Interface ---
def chat_interface(question: str, history: List[Tuple[str, str]]):
    """Gradio callback: append the Q/A pair to history and clear the textbox."""
    response = generate_response(question)
    return "", history + [(question, response)]

with gr.Blocks(title="AskNature BioRAG Expert", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌿 AskNature RAG-based Chatbot ")
    with gr.Row():
        chatbot = gr.Chatbot(label="Dialogue History", height=500)
    with gr.Row():
        question = gr.Textbox(placeholder="Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')",
                              label="Inquiry", scale=4)
        clear_btn = gr.Button("Clear History", variant="secondary")

    gr.Markdown("""
    <div style="text-align: center; color: #4a7c59;">
    <small>Powered by AskNature's Database |
    Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small>
    </div>""")
    question.submit(chat_interface, [question, chatbot], [question, chatbot])
    clear_btn.click(lambda: [], None, chatbot)

if __name__ == "__main__":
    demo.launch(show_error=True)
bm25_index.pkl_5c0c37d3cbc20e235eeec7cffd2d312f ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61d29d4cd2651f2f356e67f24dafbb804293116be434bef7ec4f43b2f5afa456
3
+ size 13737932
documents_v1_5c0c37d3cbc20e235eeec7cffd2d312f.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abddd30a2c4716bc6b06e7db60860017cd80838633bfa437dfa16f8d0e322817
3
+ size 6358288
main.ipynb ADDED
@@ -0,0 +1,1250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# Approach 1: Local Llama2 via Ollama\n",
10
+ "\n",
11
+ "questions = [\n",
12
+ " \"How do coral proteins help make eco-friendly fabrics without dyes?\",\n",
13
+ " \"What environmental problems do coral-inspired textiles solve?\",\n",
14
+ " \"What is industrial symbiosis and how does the Kalundborg example work?\",\n",
15
+ " \"How do Metavision sensors work like human eyes to save energy?\",\n",
16
+ " \"How does TISSIUM copy skin proteins for medical adhesives?\",\n",
17
+ " \"How does DNA-level design create better fibers inspired by nature?\",\n",
18
+ " \"Why is industrial symbiosis hard to implement despite benefits?\",\n",
19
+ " \"How can biological systems inspire sustainable manufacturing?\",\n",
20
+ " \"What other industries can use protein-based materials like Werewool?\",\n",
21
+ " \"How could event-based cameras improve security systems?\",\n",
22
+ " \"Design a factory network that works like coral reef partnerships - what features would it need?\"\n",
23
+ "]\n",
24
+ "\n",
25
+ "\n",
26
+ "import json\n",
27
+ "import pandas as pd\n",
28
+ "from langchain_ollama import OllamaLLM, OllamaEmbeddings\n",
29
+ "from langchain_community.vectorstores import FAISS\n",
30
+ "from langchain_core.prompts import PromptTemplate\n",
31
+ "from langchain_core.output_parsers import StrOutputParser\n",
32
+ "from operator import itemgetter\n",
33
+ "import gradio as gr\n",
34
+ "\n",
35
+ "# Load and process data\n",
36
+ "with open('mini_data.json', 'r', encoding='utf-8') as f:\n",
37
+ " data = json.load(f)\n",
38
+ "documents = [f\"Source: {item['Source']}\\nApplication: {item['Application']}\\nFunction1: {item['Function1']}\\nStrategy: {item['Strategy']}\" for item in data]\n",
39
+ "\n",
40
+ "# Local Llama2 setup\n",
41
+ "local_model = OllamaLLM(model=\"llama2\")\n",
42
+ "local_embeddings = OllamaEmbeddings(model=\"llama2\")\n",
43
+ "vectorstore = FAISS.from_texts(documents, local_embeddings)\n",
44
+ "retriever = vectorstore.as_retriever()\n",
45
+ "\n",
46
+ "# RAG pipeline\n",
47
+ "template = \"\"\"Answer the question based on the context below. If unsure, reply \"I don't know\".\n",
48
+ "Context: {context}\n",
49
+ "Question: {question}\"\"\"\n",
50
+ "prompt = PromptTemplate.from_template(template)\n",
51
+ "local_chain = ({\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")} \n",
52
+ " | prompt | local_model | StrOutputParser())\n",
53
+ "\n",
54
+ "# Chat interface\n",
55
+ "def local_rag(question, history):\n",
56
+ " response = local_chain.invoke({\"question\": question})\n",
57
+ " history.append((question, response))\n",
58
+ " return \"\", history\n",
59
+ "\n",
60
+ "with gr.Blocks() as local_demo:\n",
61
+ " gr.Markdown(\"# Local Llama2 RAG Chatbot\")\n",
62
+ " chatbot = gr.Chatbot()\n",
63
+ " question = gr.Textbox(label=\"Ask about biomimicry:\")\n",
64
+ " question.submit(local_rag, [question, chatbot], [question, chatbot])\n",
65
+ " \n",
66
+ "local_demo.launch()"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": null,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "# Approach 2: Llama3.3 via API\n",
76
+ "import json\n",
77
+ "import gradio as gr\n",
78
+ "from openai import OpenAI\n",
79
+ "from operator import itemgetter\n",
80
+ "\n",
81
+ "# API configuration\n",
82
+ "api_key = 'd9960fad1d2aaa16167902b0d26e369f'\n",
83
+ "base_url = \"https://chat-ai.academiccloud.de/v1\"\n",
84
+ "model = \"llama-3.3-70b-instruct\"\n",
85
+ "\n",
86
+ "# Initialize OpenAI client\n",
87
+ "client = OpenAI(api_key=api_key, base_url=base_url)\n",
88
+ "\n",
89
+ "# Load and process data\n",
90
+ "with open('mini_data.json', 'r', encoding='utf-8') as f:\n",
91
+ " data = json.load(f)\n",
92
+ "documents = [f\"Source: {item['Source']}\\nApplication: {item['Application']}\\nFunction1: {item['Function1']}\\nStrategy: {item['Strategy']}\" for item in data]\n",
93
+ "\n",
94
+ "def retrieve_context(question):\n",
95
+ " \"\"\"Simple keyword-based retrieval since embeddings aren't available\"\"\"\n",
96
+ " keywords = set(question.lower().split())\n",
97
+ " relevant = []\n",
98
+ " for doc in documents:\n",
99
+ " if any(keyword in doc.lower() for keyword in keywords):\n",
100
+ " relevant.append(doc)\n",
101
+ " return \"\\n\\n\".join(relevant[:3]) # Return top 3 matches\n",
102
+ "\n",
103
+ "def generate_response(question):\n",
104
+ " context = retrieve_context(question)\n",
105
+ " response = client.chat.completions.create(\n",
106
+ " messages=[\n",
107
+ " {\"role\": \"system\", \"content\": f\"Answer based on context. If unsure, say 'I don't know'.\\nContext: {context}\"},\n",
108
+ " {\"role\": \"user\", \"content\": question}\n",
109
+ " ],\n",
110
+ " model=model\n",
111
+ " )\n",
112
+ " return response.choices[0].message.content\n",
113
+ "\n",
114
+ "# Chat interface\n",
115
+ "def cloud_rag(question, history):\n",
116
+ " response = generate_response(question)\n",
117
+ " history.append((question, response))\n",
118
+ " return \"\", history\n",
119
+ "\n",
120
+ "with gr.Blocks() as demo:\n",
121
+ " gr.Markdown(\"# AskNature RAG-based Chatbot\")\n",
122
+ " chatbot = gr.Chatbot()\n",
123
+ " question = gr.Textbox(label=\"Ask about biomimicry:\")\n",
124
+ " question.submit(cloud_rag, [question, chatbot], [question, chatbot])\n",
125
+ " \n",
126
+ "demo.launch()"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "metadata": {},
133
+ "outputs": [],
134
+ "source": []
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "# Enhanced Metadata Generation with Rate Control and Incremental Processing\n",
143
+ "import json\n",
144
+ "import time\n",
145
+ "import random\n",
146
+ "from typing import Dict, List\n",
147
+ "from openai import OpenAI\n",
148
+ "from tenacity import retry, stop_after_attempt, wait_random_exponential\n",
149
+ "import os\n",
150
+ "\n",
151
+ "# Initialize OpenAI client\n",
152
+ "client = OpenAI(\n",
153
+ " api_key= 'd9960fad1d2aaa16167902b0d26e369f', # 'd1c9ed1ca70b9721dee1087d93f9662a',\n",
154
+ " base_url=\"https://chat-ai.academiccloud.de/v1\"\n",
155
+ ")\n",
156
+ "\n",
157
+ "@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(min=2, max=60))\n",
158
+ "def generate_metadata_tags(strategy_text: str) -> Dict:\n",
159
+ " \"\"\"Generate structured metadata with enhanced error handling\"\"\"\n",
160
+ " system_prompt = \"\"\"Analyze the technical text and generate structured metadata:\n",
161
+ "1. **Technical Concepts** (array, max 5 items): Specific technical terms/methods\n",
162
+ "2. **Biological Mechanisms** (array, max 3): Biological processes observed in nature\n",
163
+ "3. **Industry Applications** (array, max 3): Practical commercial uses\n",
164
+ "4. **Sustainability Impacts** (array, max 2): Environmental benefits\n",
165
+ "\n",
166
+ "Example Response:\n",
167
+ "{\n",
168
+ " \"technical_concepts\": [\"protein-based pigmentation\", \"DNA-level fiber design\"],\n",
169
+ " \"biological_mechanisms\": [\"coral-algae symbiosis\"],\n",
170
+ " \"industry_applications\": [\"textile manufacturing\"],\n",
171
+ " \"sustainability_impacts\": [\"reduces chemical waste\"]\n",
172
+ "}\"\"\"\n",
173
+ "\n",
174
+ " response = client.chat.completions.create(\n",
175
+ " messages=[\n",
176
+ " {\"role\": \"system\", \"content\": system_prompt},\n",
177
+ " {\"role\": \"user\", \"content\": strategy_text}\n",
178
+ " ],\n",
179
+ " model=\"llama-3.3-70b-instruct\",\n",
180
+ " temperature=0.1,\n",
181
+ " response_format={\"type\": \"json_object\"}\n",
182
+ " )\n",
183
+ " \n",
184
+ " return validate_metadata(json.loads(response.choices[0].message.content))\n",
185
+ "\n",
186
+ "def validate_metadata(metadata: Dict) -> Dict:\n",
187
+ " \"\"\"Ensure metadata structure quality\"\"\"\n",
188
+ " required_keys = {\n",
189
+ " \"technical_concepts\": list,\n",
190
+ " \"biological_mechanisms\": list,\n",
191
+ " \"industry_applications\": list,\n",
192
+ " \"sustainability_impacts\": list\n",
193
+ " }\n",
194
+ " \n",
195
+ " for key, type_ in required_keys.items():\n",
196
+ " if key not in metadata or not isinstance(metadata[key], type_):\n",
197
+ " raise ValueError(f\"Invalid metadata format for {key}\")\n",
198
+ " \n",
199
+ " return metadata\n",
200
+ "\n",
201
+ "def enhance_dataset(input_file: str, output_file: str):\n",
202
+ " \"\"\"Robust incremental metadata enhancement with rate control\"\"\"\n",
203
+ " # Load existing enhanced data\n",
204
+ " existing_data = []\n",
205
+ " existing_hyperlinks = set()\n",
206
+ " \n",
207
+ " if os.path.exists(output_file):\n",
208
+ " with open(output_file, 'r') as f:\n",
209
+ " existing_data = json.load(f)\n",
210
+ " existing_hyperlinks = {item[\"Hyperlink\"] for item in existing_data if \"Hyperlink\" in item}\n",
211
+ " \n",
212
+ " # Load input data and filter unprocessed items\n",
213
+ " with open(input_file, 'r') as f:\n",
214
+ " input_data = json.load(f)\n",
215
+ " \n",
216
+ " new_items = [item for item in input_data if item.get(\"Hyperlink\") not in existing_hyperlinks]\n",
217
+ " \n",
218
+ " if not new_items:\n",
219
+ " print(\"All items already processed in the enhanced file.\")\n",
220
+ " return\n",
221
+ " else:\n",
222
+ " output_length = len(existing_data)\n",
223
+ " input_length = len(input_data)\n",
224
+ " print(f\"Processing {len(new_items)} new items... out of {input_length} total\")\n",
225
+ " \n",
226
+ " results = existing_data.copy()\n",
227
+ " error_count = 0\n",
228
+ " total_items = len(new_items)\n",
229
+ " \n",
230
+ " for idx, item in enumerate(new_items):\n",
231
+ " try:\n",
232
+ " # Enhanced rate control with progressive backoff\n",
233
+ " if idx > 0:\n",
234
+ " base_delay = min(5 + (idx // 10), 30) # Progressive delay up to 30s\n",
235
+ " delay = random.uniform(base_delay, base_delay + 5)\n",
236
+ " time.sleep(delay)\n",
237
+ " \n",
238
+ " # Process item\n",
239
+ " metadata = generate_metadata_tags(item[\"Strategy\"])\n",
240
+ " enhanced_item = {**item, **metadata}\n",
241
+ " results.append(enhanced_item)\n",
242
+ " \n",
243
+ " # Checkpoint saving\n",
244
+ " if (idx + 1) % 5 == 0 or (idx + 1) == total_items:\n",
245
+ " with open(output_file, 'w') as f:\n",
246
+ " json.dump(results, f, indent=2)\n",
247
+ " print(f\"Progress: {idx+1+output_length}/{input_length} items processed\")\n",
248
+ " \n",
249
+ " except Exception as e:\n",
250
+ " error_count += 1\n",
251
+ " print(f\"Error processing {item.get('Source', 'Unknown')}: {str(e)}\")\n",
252
+ " # results.append(item) # Preserve original data\n",
253
+ " \n",
254
+ " print(f\"Processing complete. Success rate: {total_items-error_count}/{input_length}\")\n",
255
+ "\n",
256
+ "# Execute enhancement\n",
257
+ "enhance_dataset(\"AskNatureNet_data.json\", \"AskNatureNet_data_enhanced.json\")"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "metadata": {},
264
+ "outputs": [],
265
+ "source": [
266
+ "# Optimized RAG System with E5-Mistral Embeddings and Llama3-70B Generation\n",
267
+ " \n",
268
+ "import json\n",
269
+ "import logging\n",
270
+ "import re\n",
271
+ "import os\n",
272
+ "import pickle\n",
273
+ "from typing import List, Tuple, Optional\n",
274
+ "import gradio as gr\n",
275
+ "from openai import OpenAI\n",
276
+ "from functools import lru_cache\n",
277
+ "from tenacity import retry, stop_after_attempt, wait_exponential\n",
278
+ "from langchain_community.retrievers import BM25Retriever\n",
279
+ "from langchain_community.vectorstores import FAISS\n",
280
+ "from langchain_core.embeddings import Embeddings\n",
281
+ "from langchain_core.documents import Document\n",
282
+ "from collections import defaultdict\n",
283
+ "import hashlib\n",
284
+ "from tqdm import tqdm # For progress tracking\n",
285
+ "from dotenv import load_dotenv\n",
286
+ "load_dotenv()\n",
287
+ "\n",
288
+ "# --- Configuration ---\n",
289
+ "FAISS_INDEX_PATH = \"faiss_index\"\n",
290
+ "BM25_INDEX_PATH = \"bm25_index.pkl\"\n",
291
+ "CACHE_VERSION = \"v1\" # Increment when data format changes\n",
292
+ "embedding_model = \"e5-mistral-7b-instruct\"\n",
293
+ "generation_model = \"meta-llama-3-70b-instruct\"\n",
294
+ "data_file_name = \"AskNatureNet_data_enhanced.json\"\n",
295
+ "API_CONFIG = {\n",
296
+ " \"api_key\": os.getenv(\"OPENAI_API_KEY\"),\n",
297
+ " \"base_url\": \"https://chat-ai.academiccloud.de/v1\"\n",
298
+ "}\n",
299
+ "CHUNK_SIZE = 800\n",
300
+ "OVERLAP = 200\n",
301
+ "EMBEDDING_BATCH_SIZE = 32 # Batch size for embedding API calls\n",
302
+ "\n",
303
+ "# Initialize clients\n",
304
+ "client = OpenAI(**API_CONFIG)\n",
305
+ "logging.basicConfig(level=logging.INFO)\n",
306
+ "logger = logging.getLogger(__name__)\n",
307
+ "\n",
308
+ "# --- Helper Functions ---\n",
309
+ "def get_data_hash(file_path: str) -> str:\n",
310
+ " \"\"\"Generate hash of data file for cache validation\"\"\"\n",
311
+ " with open(file_path, \"rb\") as f:\n",
312
+ " return hashlib.md5(f.read()).hexdigest()\n",
313
+ "\n",
314
+ "# --- Custom Embedding Handler with Progress Tracking ---\n",
315
+ "class MistralEmbeddings(Embeddings):\n",
316
+ " \"\"\"E5-Mistral-7B embedding adapter with error handling and progress tracking\"\"\"\n",
317
+ " def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
318
+ " embeddings = []\n",
319
+ " try:\n",
320
+ " # Process in batches with progress tracking\n",
321
+ " for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc=\"Embedding Progress\"):\n",
322
+ " batch = texts[i:i + EMBEDDING_BATCH_SIZE]\n",
323
+ " response = client.embeddings.create(\n",
324
+ " input=batch,\n",
325
+ " model=embedding_model,\n",
326
+ " encoding_format=\"float\"\n",
327
+ " )\n",
328
+ " embeddings.extend([e.embedding for e in response.data])\n",
329
+ " return embeddings\n",
330
+ " except Exception as e:\n",
331
+ " logger.error(f\"Embedding Error: {str(e)}\")\n",
332
+ " return [[] for _ in texts]\n",
333
+ "\n",
334
+ " def embed_query(self, text: str) -> List[float]:\n",
335
+ " return self.embed_documents([text])[0]\n",
336
+ "\n",
337
+ "# --- Data Processing with Cache Validation ---\n",
338
+ "def load_and_chunk_data(file_path: str) -> List[Document]:\n",
339
+ " \"\"\"Enhanced chunking with metadata preservation\"\"\"\n",
340
+ " current_hash = get_data_hash(file_path)\n",
341
+ " cache_file = f\"documents_{CACHE_VERSION}_{current_hash}.pkl\"\n",
342
+ " \n",
343
+ " if os.path.exists(cache_file):\n",
344
+ " logger.info(\"Loading cached documents\")\n",
345
+ " with open(cache_file, \"rb\") as f:\n",
346
+ " return pickle.load(f)\n",
347
+ " \n",
348
+ " with open(file_path, 'r', encoding='utf-8') as f:\n",
349
+ " data = json.load(f)\n",
350
+ " \n",
351
+ " documents = []\n",
352
+ " for item in tqdm(data, desc=\"Chunking Progress\"):\n",
353
+ " base_content = f\"\"\"Source: {item['Source']}\n",
354
+ "Application: {item['Application']}\n",
355
+ "Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}\n",
356
+ "Technical Concepts: {', '.join(item['technical_concepts'])}\n",
357
+ "Biological Mechanisms: {', '.join(item['biological_mechanisms'])}\"\"\"\n",
358
+ " \n",
359
+ " strategy = item['Strategy']\n",
360
+ " for i in range(0, len(strategy), CHUNK_SIZE - OVERLAP):\n",
361
+ " chunk = strategy[i:i + CHUNK_SIZE]\n",
362
+ " documents.append(Document(\n",
363
+ " page_content=f\"{base_content}\\nStrategy Excerpt:\\n{chunk}\",\n",
364
+ " metadata={\n",
365
+ " \"source\": item[\"Source\"],\n",
366
+ " \"application\": item[\"Application\"],\n",
367
+ " \"technical_concepts\": item[\"technical_concepts\"],\n",
368
+ " \"sustainability_impacts\": item[\"sustainability_impacts\"],\n",
369
+ " \"hyperlink\": item[\"Hyperlink\"],\n",
370
+ " \"chunk_id\": f\"{item['Source']}-{len(documents)+1}\"\n",
371
+ " }\n",
372
+ " ))\n",
373
+ " \n",
374
+ " with open(cache_file, \"wb\") as f:\n",
375
+ " pickle.dump(documents, f)\n",
376
+ " return documents\n",
377
+ "\n",
378
+ "# --- Optimized Retrieval System ---\n",
379
+ "class EnhancedRetriever:\n",
380
+ " \"\"\"Hybrid retriever with persistent caching\"\"\"\n",
381
+ " def __init__(self, documents: List[Document]):\n",
382
+ " self.documents = documents\n",
383
+ " self.bm25 = self._init_bm25()\n",
384
+ " self.vector_store = self._init_faiss()\n",
385
+ " self.vector_retriever = self.vector_store.as_retriever(search_kwargs={\"k\": 3})\n",
386
+ "\n",
387
+ " def _init_bm25(self) -> BM25Retriever:\n",
388
+ " cache_key = f\"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
389
+ " if os.path.exists(cache_key):\n",
390
+ " logger.info(\"Loading cached BM25 index\")\n",
391
+ " with open(cache_key, \"rb\") as f:\n",
392
+ " return pickle.load(f)\n",
393
+ " \n",
394
+ " logger.info(\"Building new BM25 index\")\n",
395
+ " retriever = BM25Retriever.from_documents(self.documents)\n",
396
+ " retriever.k = 5\n",
397
+ " with open(cache_key, \"wb\") as f:\n",
398
+ " pickle.dump(retriever, f)\n",
399
+ " return retriever\n",
400
+ "\n",
401
+ " def _init_faiss(self) -> FAISS:\n",
402
+ " cache_key = f\"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
403
+ " if os.path.exists(cache_key):\n",
404
+ " logger.info(\"Loading cached FAISS index\")\n",
405
+ " return FAISS.load_local(\n",
406
+ " cache_key,\n",
407
+ " MistralEmbeddings(),\n",
408
+ " allow_dangerous_deserialization=True\n",
409
+ " )\n",
410
+ " \n",
411
+ " logger.info(\"Building new FAISS index\")\n",
412
+ " vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())\n",
413
+ " vector_store.save_local(cache_key)\n",
414
+ " return vector_store\n",
415
+ "\n",
416
+ " @lru_cache(maxsize=500)\n",
417
+ " def retrieve(self, query: str) -> str:\n",
418
+ " try:\n",
419
+ " processed_query = self._preprocess_query(query)\n",
420
+ " expanded_query = self._hyde_expansion(processed_query)\n",
421
+ " \n",
422
+ " bm25_results = self.bm25.invoke(processed_query)\n",
423
+ " vector_results = self.vector_retriever.invoke(processed_query)\n",
424
+ " expanded_results = self.bm25.invoke(expanded_query)\n",
425
+ " \n",
426
+ " fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])\n",
427
+ " return self._format_context(fused_results[:5])\n",
428
+ " except Exception as e:\n",
429
+ " logger.error(f\"Retrieval Error: {str(e)}\")\n",
430
+ " return \"\"\n",
431
+ "\n",
432
+ " def _preprocess_query(self, query: str) -> str:\n",
433
+ " return query.lower().strip()\n",
434
+ "\n",
435
+ " @lru_cache(maxsize=500)\n",
436
+ " def _hyde_expansion(self, query: str) -> str:\n",
437
+ " try:\n",
438
+ " response = client.chat.completions.create(\n",
439
+ " model=generation_model,\n",
440
+ " messages=[{\n",
441
+ " \"role\": \"user\",\n",
442
+ " \"content\": f\"Generate a technical draft about biomimicry for: {query}\\nInclude domain-specific terms.\"\n",
443
+ " }],\n",
444
+ " temperature=0.5,\n",
445
+ " max_tokens=200\n",
446
+ " )\n",
447
+ " return response.choices[0].message.content\n",
448
+ " except Exception as e:\n",
449
+ " logger.error(f\"HyDE Error: {str(e)}\")\n",
450
+ " return query\n",
451
+ "\n",
452
+ " def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:\n",
453
+ " fused_scores = defaultdict(float)\n",
454
+ " for docs in result_sets:\n",
455
+ " for rank, doc in enumerate(docs, 1):\n",
456
+ " fused_scores[doc.metadata[\"chunk_id\"]] += 1 / (rank + 60)\n",
457
+ " \n",
458
+ " seen = set()\n",
459
+ " return [\n",
460
+ " doc for doc in sorted(\n",
461
+ " (doc for docs in result_sets for doc in docs),\n",
462
+ " key=lambda x: fused_scores[x.metadata[\"chunk_id\"]],\n",
463
+ " reverse=True\n",
464
+ " ) if not (doc.metadata[\"chunk_id\"] in seen or seen.add(doc.metadata[\"chunk_id\"]))\n",
465
+ " ]\n",
466
+ "\n",
467
+ " def _format_context(self, docs: List[Document]) -> str:\n",
468
+ " context = []\n",
469
+ " for doc in docs:\n",
470
+ " context_str = f\"\"\"**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})\n",
471
+ " **Application**: {doc.metadata['application']}\n",
472
+ " **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}\n",
473
+ " **Strategy Excerpt**:\\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}\"\"\"\n",
474
+ " context.append(context_str)\n",
475
+ " return \"\\n\\n---\\n\\n\".join(context)\n",
476
+ "\n",
477
+ "# --- Generation System ---\n",
478
+ "SYSTEM_PROMPT = \"\"\"**Biomimicry Expert Guidelines**\n",
479
+ "1. Base answers strictly on context\n",
480
+ "2. Cite sources as [Source]\n",
481
+ "3. **Bold** technical terms\n",
482
+ "4. Include reference links\n",
483
+ "\n",
484
+ "Context: {context}\"\"\"\n",
485
+ "\n",
486
+ "@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))\n",
487
+ "def get_ai_response(query: str, context: str) -> str:\n",
488
+ " try:\n",
489
+ " response = client.chat.completions.create(\n",
490
+ " model=generation_model,\n",
491
+ " messages=[\n",
492
+ " {\"role\": \"system\", \"content\": SYSTEM_PROMPT.format(context=context)},\n",
493
+ " {\"role\": \"user\", \"content\": f\"Question: {query}\\nProvide a detailed technical answer:\"}\n",
494
+ " ],\n",
495
+ " temperature=0.4,\n",
496
+ " max_tokens=2000 # Increased max_tokens\n",
497
+ " )\n",
498
+ " logger.info(f\"Raw Response: {response.choices[0].message.content}\") # Log raw response\n",
499
+ " return _postprocess_response(response.choices[0].message.content)\n",
500
+ " except Exception as e:\n",
501
+ " logger.error(f\"Generation Error: {str(e)}\")\n",
502
+ " return \"I'm unable to generate a response right now. Please try again later.\"\n",
503
+ "\n",
504
+ "def _postprocess_response(response: str) -> str:\n",
505
+ " response = re.sub(r\"\\[(.*?)\\]\", r\"[\\1](#)\", response)\n",
506
+ " response = re.sub(r\"\\*\\*([\\w-]+)\\*\\*\", r\"**\\1**\", response)\n",
507
+ " return response\n",
508
+ "\n",
509
+ "# --- Optimized Pipeline ---\n",
510
+ "documents = load_and_chunk_data(data_file_name)\n",
511
+ "retriever = EnhancedRetriever(documents)\n",
512
+ "\n",
513
+ "def generate_response(question: str) -> str:\n",
514
+ " try:\n",
515
+ " context = retriever.retrieve(question)\n",
516
+ " return get_ai_response(question, context) if context else \"No relevant information found.\"\n",
517
+ " except Exception as e:\n",
518
+ " logger.error(f\"Pipeline Error: {str(e)}\")\n",
519
+ " return \"An error occurred processing your request.\"\n",
520
+ "\n",
521
+ "# --- Gradio Interface ---\n",
522
+ "def chat_interface(question: str, history: List[Tuple[str, str]]):\n",
523
+ " response = generate_response(question)\n",
524
+ " return \"\", history + [(question, response)]\n",
525
+ "\n",
526
+ "with gr.Blocks(title=\"AskNature BioRAG Expert\", theme=gr.themes.Soft()) as demo:\n",
527
+ " gr.Markdown(\"# 🌿 AskNature RAG-based Chatbot \")\n",
528
+ " with gr.Row():\n",
529
+ " chatbot = gr.Chatbot(label=\"Dialogue History\", height=500)\n",
530
+ " with gr.Row():\n",
531
+ " question = gr.Textbox(placeholder=\"Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')\",\n",
532
+ " label=\"Inquiry\", scale=4)\n",
533
+ " clear_btn = gr.Button(\"Clear History\", variant=\"secondary\")\n",
534
+ " \n",
535
+ " gr.Markdown(\"\"\"\n",
536
+ " <div style=\"text-align: center; color: #4a7c59;\">\n",
537
+ " <small>Powered by AskNature's Database | \n",
538
+ " Explore nature's blueprints at <a href=\"https://asknature.org\">asknature.org</a></small>\n",
539
+ " </div>\"\"\")\n",
540
+ " question.submit(chat_interface, [question, chatbot], [question, chatbot])\n",
541
+ " clear_btn.click(lambda: [], None, chatbot)\n",
542
+ "\n",
543
+ "if __name__ == \"__main__\":\n",
544
+ " demo.launch(show_error=True)"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": null,
550
+ "metadata": {},
551
+ "outputs": [],
552
+ "source": [
553
+ "from dotenv import load_dotenv\n",
554
+ "import os\n",
555
+ "load_dotenv()\n",
556
+ "print(os.getenv(\"API_KEY\"))"
557
+ ]
558
+ },
559
+ {
560
+ "cell_type": "code",
561
+ "execution_count": null,
562
+ "metadata": {},
563
+ "outputs": [],
564
+ "source": []
565
+ },
566
+ {
567
+ "cell_type": "code",
568
+ "execution_count": null,
569
+ "metadata": {},
570
+ "outputs": [],
571
+ "source": []
572
+ },
573
+ {
574
+ "cell_type": "code",
575
+ "execution_count": null,
576
+ "metadata": {},
577
+ "outputs": [],
578
+ "source": []
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": null,
583
+ "metadata": {},
584
+ "outputs": [],
585
+ "source": []
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": null,
590
+ "metadata": {},
591
+ "outputs": [],
592
+ "source": []
593
+ },
594
+ {
595
+ "cell_type": "code",
596
+ "execution_count": null,
597
+ "metadata": {},
598
+ "outputs": [],
599
+ "source": [
600
+ "# Optimized RAG System with E5-Mistral Embeddings and Gemini Flash Generation\n",
601
+ "\n",
602
+ "import json\n",
603
+ "import logging\n",
604
+ "import re\n",
605
+ "import os\n",
606
+ "import pickle\n",
607
+ "from typing import List, Tuple, Optional\n",
608
+ "import gradio as gr\n",
609
+ "from openai import OpenAI \n",
610
+ "from google import genai \n",
611
+ "from functools import lru_cache\n",
612
+ "from tenacity import retry, stop_after_attempt, wait_exponential\n",
613
+ "from langchain_community.retrievers import BM25Retriever\n",
614
+ "from langchain_community.vectorstores import FAISS\n",
615
+ "from langchain_core.embeddings import Embeddings\n",
616
+ "from langchain_core.documents import Document\n",
617
+ "from collections import defaultdict\n",
618
+ "import hashlib\n",
619
+ "from tqdm import tqdm \n",
620
+ "\n",
621
+ "from dotenv import load_dotenv\n",
622
+ "load_dotenv()\n",
623
+ "# --- Configuration ---\n",
624
+ "FAISS_INDEX_PATH = \"faiss_index\"\n",
625
+ "BM25_INDEX_PATH = \"bm25_index.pkl\"\n",
626
+ "CACHE_VERSION = \"v1\" # Increment when data format changes\n",
627
+ "embedding_model = \"e5-mistral-7b-instruct\" # OpenAI embedding model\n",
628
+ "generation_model = \"gemini-2.0-flash\" # Gemini generation model\n",
629
+ "data_file_name = \"AskNatureNet_data_enhanced.json\"\n",
630
+ "API_CONFIG = {\n",
631
+ " \"gemini_api_key\": os.getenv(\"GEMINI_API_KEY\") # Gemini API key for generation\n",
632
+ "}\n",
633
+ "\n",
634
+ "CHUNK_SIZE = 800\n",
635
+ "OVERLAP = 200\n",
636
+ "EMBEDDING_BATCH_SIZE = 32 # Batch size for embedding API calls\n",
637
+ "\n",
638
+ "# Initialize clients\n",
639
+ "OPENAI_API_CONFIG = {\n",
640
+ " \"api_key\": os.getenv(\"OPENAI_API_KEY\"),\n",
641
+ " \"base_url\": \"https://chat-ai.academiccloud.de/v1\"\n",
642
+ "}\n",
643
+ "client = OpenAI(**OPENAI_API_CONFIG)\n",
644
+ "gemini_client = genai.Client(api_key=API_CONFIG[\"gemini_api_key\"]) # Gemini client for generation\n",
645
+ "logging.basicConfig(level=logging.INFO)\n",
646
+ "logger = logging.getLogger(__name__)\n",
647
+ "\n",
648
+ "# --- Helper Functions ---\n",
649
+ "def get_data_hash(file_path: str) -> str:\n",
650
+ " \"\"\"Generate hash of data file for cache validation\"\"\"\n",
651
+ " with open(file_path, \"rb\") as f:\n",
652
+ " return hashlib.md5(f.read()).hexdigest()\n",
653
+ "\n",
654
+ "# --- Custom Embedding Handler with Progress Tracking ---\n",
655
+ "class MistralEmbeddings(Embeddings):\n",
656
+ " \"\"\"E5-Mistral-7B embedding adapter with error handling and progress tracking\"\"\"\n",
657
+ " def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
658
+ " embeddings = []\n",
659
+ " try:\n",
660
+ " # Process in batches with progress tracking\n",
661
+ " for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc=\"Embedding Progress\"):\n",
662
+ " batch = texts[i:i + EMBEDDING_BATCH_SIZE]\n",
663
+ " response = client.embeddings.create(\n",
664
+ " input=batch,\n",
665
+ " model=embedding_model,\n",
666
+ " encoding_format=\"float\"\n",
667
+ " )\n",
668
+ " embeddings.extend([e.embedding for e in response.data])\n",
669
+ " return embeddings\n",
670
+ " except Exception as e:\n",
671
+ " logger.error(f\"Embedding Error: {str(e)}\")\n",
672
+ " return [[] for _ in texts]\n",
673
+ " \n",
674
+ " def embed_query(self, text: str) -> List[float]:\n",
675
+ " return self.embed_documents([text])[0]\n",
676
+ "\n",
677
+ "# --- Data Processing with Cache Validation ---\n",
678
+ "def load_and_chunk_data(file_path: str) -> List[Document]:\n",
679
+ " \"\"\"Enhanced chunking with metadata preservation\"\"\"\n",
680
+ " current_hash = get_data_hash(file_path)\n",
681
+ " cache_file = f\"documents_{CACHE_VERSION}_{current_hash}.pkl\"\n",
682
+ " \n",
683
+ " if os.path.exists(cache_file):\n",
684
+ " logger.info(\"Loading cached documents\")\n",
685
+ " with open(cache_file, \"rb\") as f:\n",
686
+ " return pickle.load(f)\n",
687
+ " \n",
688
+ " with open(file_path, 'r', encoding='utf-8') as f:\n",
689
+ " data = json.load(f)\n",
690
+ " \n",
691
+ " documents = []\n",
692
+ " for item in tqdm(data, desc=\"Chunking Progress\"):\n",
693
+ " base_content = f\"\"\"Source: {item['Source']}\n",
694
+ "Application: {item['Application']}\n",
695
+ "Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}\n",
696
+ "Technical Concepts: {', '.join(item['technical_concepts'])}\n",
697
+ "Biological Mechanisms: {', '.join(item['biological_mechanisms'])}\"\"\"\n",
698
+ " \n",
699
+ " strategy = item['Strategy']\n",
700
+ " for i in range(0, len(strategy), CHUNK_SIZE - OVERLAP):\n",
701
+ " chunk = strategy[i:i + CHUNK_SIZE]\n",
702
+ " documents.append(Document(\n",
703
+ " page_content=f\"{base_content}\\nStrategy Excerpt:\\n{chunk}\",\n",
704
+ " metadata={\n",
705
+ " \"source\": item[\"Source\"],\n",
706
+ " \"application\": item[\"Application\"],\n",
707
+ " \"technical_concepts\": item[\"technical_concepts\"],\n",
708
+ " \"sustainability_impacts\": item[\"sustainability_impacts\"],\n",
709
+ " \"hyperlink\": item[\"Hyperlink\"],\n",
710
+ " \"chunk_id\": f\"{item['Source']}-{len(documents)+1}\"\n",
711
+ " }\n",
712
+ " ))\n",
713
+ " \n",
714
+ " with open(cache_file, \"wb\") as f:\n",
715
+ " pickle.dump(documents, f)\n",
716
+ " return documents\n",
717
+ "\n",
718
+ "# --- Optimized Retrieval System ---\n",
719
+ "class EnhancedRetriever:\n",
720
+ " \"\"\"Hybrid retriever with persistent caching\"\"\"\n",
721
+ " def __init__(self, documents: List[Document]):\n",
722
+ " self.documents = documents\n",
723
+ " self.bm25 = self._init_bm25()\n",
724
+ " self.vector_store = self._init_faiss()\n",
725
+ " self.vector_retriever = self.vector_store.as_retriever(search_kwargs={\"k\": 3})\n",
726
+ "\n",
727
+ " def _init_bm25(self) -> BM25Retriever:\n",
728
+ " cache_key = f\"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
729
+ " if os.path.exists(cache_key):\n",
730
+ " logger.info(\"Loading cached BM25 index\")\n",
731
+ " with open(cache_key, \"rb\") as f:\n",
732
+ " return pickle.load(f)\n",
733
+ " \n",
734
+ " logger.info(\"Building new BM25 index\")\n",
735
+ " retriever = BM25Retriever.from_documents(self.documents)\n",
736
+ " retriever.k = 5\n",
737
+ " with open(cache_key, \"wb\") as f:\n",
738
+ " pickle.dump(retriever, f)\n",
739
+ " return retriever\n",
740
+ "\n",
741
+ " def _init_faiss(self) -> FAISS:\n",
742
+ " cache_key = f\"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
743
+ " if os.path.exists(cache_key):\n",
744
+ " logger.info(\"Loading cached FAISS index\")\n",
745
+ " return FAISS.load_local(\n",
746
+ " cache_key,\n",
747
+ " MistralEmbeddings(),\n",
748
+ " allow_dangerous_deserialization=True\n",
749
+ " )\n",
750
+ " \n",
751
+ " logger.info(\"Building new FAISS index\")\n",
752
+ " vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())\n",
753
+ " vector_store.save_local(cache_key)\n",
754
+ " return vector_store\n",
755
+ "\n",
756
+ " @lru_cache(maxsize=500)\n",
757
+ " def retrieve(self, query: str) -> str:\n",
758
+ " try:\n",
759
+ " processed_query = self._preprocess_query(query)\n",
760
+ " expanded_query = self._hyde_expansion(processed_query)\n",
761
+ " \n",
762
+ " bm25_results = self.bm25.invoke(processed_query)\n",
763
+ " vector_results = self.vector_retriever.invoke(processed_query)\n",
764
+ " expanded_results = self.bm25.invoke(expanded_query)\n",
765
+ " \n",
766
+ " fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])\n",
767
+ " return self._format_context(fused_results[:5])\n",
768
+ " except Exception as e:\n",
769
+ " logger.error(f\"Retrieval Error: {str(e)}\")\n",
770
+ " return \"\"\n",
771
+ "\n",
772
+ " def _preprocess_query(self, query: str) -> str:\n",
773
+ " return query.lower().strip()\n",
774
+ "\n",
775
+ " @lru_cache(maxsize=500)\n",
776
+ " def _hyde_expansion(self, query: str) -> str:\n",
777
+ " try:\n",
778
+ " response = gemini_client.models.generate_content( # Use Gemini client for HyDE\n",
779
+ " model=generation_model,\n",
780
+ " contents=f\"Generate a technical draft about biomimicry for: {query}\\nInclude domain-specific terms.\"\n",
781
+ " )\n",
782
+ " return response.text\n",
783
+ " except Exception as e:\n",
784
+ " logger.error(f\"HyDE Error: {str(e)}\")\n",
785
+ " return query\n",
786
+ "\n",
787
+ " def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:\n",
788
+ " fused_scores = defaultdict(float)\n",
789
+ " for docs in result_sets:\n",
790
+ " for rank, doc in enumerate(docs, 1):\n",
791
+ " fused_scores[doc.metadata[\"chunk_id\"]] += 1 / (rank + 60)\n",
792
+ " \n",
793
+ " seen = set()\n",
794
+ " return [\n",
795
+ " doc for doc in sorted(\n",
796
+ " (doc for docs in result_sets for doc in docs),\n",
797
+ " key=lambda x: fused_scores[x.metadata[\"chunk_id\"]],\n",
798
+ " reverse=True\n",
799
+ " ) if not (doc.metadata[\"chunk_id\"] in seen or seen.add(doc.metadata[\"chunk_id\"]))\n",
800
+ " ]\n",
801
+ "\n",
802
+ " def _format_context(self, docs: List[Document]) -> str:\n",
803
+ " context = []\n",
804
+ " for doc in docs:\n",
805
+ " context_str = f\"\"\"**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})\n",
806
+ " **Application**: {doc.metadata['application']}\n",
807
+ " **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}\n",
808
+ " **Strategy Excerpt**:\\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}\"\"\"\n",
809
+ " context.append(context_str)\n",
810
+ " return \"\\n\\n---\\n\\n\".join(context)\n",
811
+ "\n",
812
+ "# --- Generation System ---\n",
813
+ "SYSTEM_PROMPT = \"\"\"**Biomimicry Expert Guidelines**\n",
814
+ "1. Base answers strictly on context\n",
815
+ "2. **Bold** technical terms\n",
816
+ "3. Include reference links at the end of the response\n",
817
+ "\n",
818
+ "Context: {context}\"\"\"\n",
819
+ "\n",
820
+ "@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))\n",
821
+ "def get_ai_response(query: str, context: str) -> str:\n",
822
+ " try:\n",
823
+ " response = gemini_client.models.generate_content( # Use Gemini client for generation\n",
824
+ " model=generation_model,\n",
825
+ " contents=f\"{SYSTEM_PROMPT.format(context=context)}\\nQuestion: {query}\\nProvide a detailed technical answer:\"\n",
826
+ " )\n",
827
+ " logger.info(f\"Raw Response: {response.text}\") # Log raw response\n",
828
+ " return _postprocess_response(response.text)\n",
829
+ " except Exception as e:\n",
830
+ " logger.error(f\"Generation Error: {str(e)}\")\n",
831
+ " return \"I'm unable to generate a response right now. Please try again later.\"\n",
832
+ "\n",
833
+ "def _postprocess_response(response: str) -> str:\n",
834
+ " response = re.sub(r\"\\[(.*?)\\]\", r\"[\\1](#)\", response)\n",
835
+ " response = re.sub(r\"\\*\\*([\\w-]+)\\*\\*\", r\"**\\1**\", response)\n",
836
+ " return response\n",
837
+ "\n",
838
+ "# --- Optimized Pipeline ---\n",
839
+ "documents = load_and_chunk_data(data_file_name)\n",
840
+ "retriever = EnhancedRetriever(documents)\n",
841
+ "\n",
842
+ "def generate_response(question: str) -> str:\n",
843
+ " try:\n",
844
+ " context = retriever.retrieve(question)\n",
845
+ " return get_ai_response(question, context) if context else \"No relevant information found.\"\n",
846
+ " except Exception as e:\n",
847
+ " logger.error(f\"Pipeline Error: {str(e)}\")\n",
848
+ " return \"An error occurred processing your request.\"\n",
849
+ "\n",
850
+ "# --- Gradio Interface ---\n",
851
+ "def chat_interface(question: str, history: List[Tuple[str, str]]):\n",
852
+ " response = generate_response(question)\n",
853
+ " return \"\", history + [(question, response)]\n",
854
+ "\n",
855
+ "with gr.Blocks(title=\"AskNature BioRAG Expert\", theme=gr.themes.Soft()) as demo:\n",
856
+ " gr.Markdown(\"# 🌿 AskNature RAG-based Chatbot \")\n",
857
+ " with gr.Row():\n",
858
+ " chatbot = gr.Chatbot(label=\"Dialogue History\", height=500)\n",
859
+ " with gr.Row():\n",
860
+ " question = gr.Textbox(placeholder=\"Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')\",\n",
861
+ " label=\"Inquiry\", scale=4)\n",
862
+ " clear_btn = gr.Button(\"Clear History\", variant=\"secondary\")\n",
863
+ " \n",
864
+ " gr.Markdown(\"\"\"\n",
865
+ " <div style=\"text-align: center; color: #4a7c59;\">\n",
866
+ " <small>Powered by AskNature's Database | \n",
867
+ " Explore nature's blueprints at <a href=\"https://asknature.org\">asknature.org</a></small>\n",
868
+ " </div>\"\"\")\n",
869
+ " question.submit(chat_interface, [question, chatbot], [question, chatbot])\n",
870
+ " clear_btn.click(lambda: [], None, chatbot)\n",
871
+ "\n",
872
+ "if __name__ == \"__main__\":\n",
873
+ " demo.launch(show_error=True)"
874
+ ]
875
+ },
876
+ {
877
+ "cell_type": "code",
878
+ "execution_count": null,
879
+ "metadata": {},
880
+ "outputs": [],
881
+ "source": []
882
+ },
883
+ {
884
+ "cell_type": "code",
885
+ "execution_count": null,
886
+ "metadata": {},
887
+ "outputs": [],
888
+ "source": []
889
+ },
890
+ {
891
+ "cell_type": "code",
892
+ "execution_count": null,
893
+ "metadata": {},
894
+ "outputs": [],
895
+ "source": [
896
+ "# Optimized RAG System with E5-Mistral Embeddings and Gemini Flash Generation with Rate Control\n",
897
+ "import json\n",
898
+ "import logging\n",
899
+ "import re\n",
900
+ "import os\n",
901
+ "import pickle\n",
902
+ "from typing import List, Tuple, Optional\n",
903
+ "import gradio as gr\n",
904
+ "from openai import OpenAI # For embeddings\n",
905
+ "from google import genai # For generation\n",
906
+ "from functools import lru_cache\n",
907
+ "from tenacity import retry, stop_after_attempt, wait_exponential\n",
908
+ "from langchain_community.retrievers import BM25Retriever\n",
909
+ "from langchain_community.vectorstores import FAISS\n",
910
+ "from langchain_core.embeddings import Embeddings\n",
911
+ "from langchain_core.documents import Document\n",
912
+ "from collections import defaultdict\n",
913
+ "import hashlib\n",
914
+ "from tqdm import tqdm # For progress tracking\n",
915
+ "import time # For rate limit testing\n",
916
+ "from threading import Thread # For concurrent requests\n",
917
+ "\n",
918
+ "from dotenv import load_dotenv\n",
919
+ "load_dotenv()\n",
920
+ "\n",
921
+ "# --- Configuration ---\n",
922
+ "FAISS_INDEX_PATH = \"faiss_index\"\n",
923
+ "BM25_INDEX_PATH = \"bm25_index.pkl\"\n",
924
+ "CACHE_VERSION = \"v1\" # Increment when data format changes\n",
925
+ "embedding_model = \"e5-mistral-7b-instruct\" # OpenAI embedding model\n",
926
+ "generation_model = \"gemini-2.0-flash\" # Gemini generation model\n",
927
+ "data_file_name = \"AskNatureNet_data_enhanced.json\"\n",
928
+ "EMBEDDING_BATCH_SIZE = 32 # Batch size for embedding API calls\n",
929
+ "\n",
930
+ "# List of Gemini API keys\n",
931
+ "GEMINI_API_KEYS = [\n",
932
+ " os.getenv(\"GEMINI_API_KEY_1\"),\n",
933
+ " os.getenv(\"GEMINI_API_KEY_2\")\n",
934
+ "]\n",
935
+ "\n",
936
+ "current_key_index = 0\n",
937
+ "\n",
938
+ "def get_gemini_client():\n",
939
+ " global current_key_index\n",
940
+ " api_key = GEMINI_API_KEYS[current_key_index]\n",
941
+ " print(f\"Using Gemini API Key: {api_key}\")\n",
942
+ " return genai.Client(api_key=api_key)\n",
943
+ "\n",
944
+ "def switch_gemini_key():\n",
945
+ " global current_key_index\n",
946
+ " current_key_index = (current_key_index + 1) % len(GEMINI_API_KEYS)\n",
947
+ " print(f\"Switched to Gemini API Key: {GEMINI_API_KEYS[current_key_index]}\")\n",
948
+ " return get_gemini_client()\n",
949
+ "\n",
950
+ "# Initialize clients\n",
951
+ "OPENAI_API_CONFIG = {\n",
952
+ " \"api_key\": os.getenv(\"OPENAI_API_KEY\"),\n",
953
+ " \"base_url\": \"https://chat-ai.academiccloud.de/v1\"\n",
954
+ "}\n",
955
+ "client = OpenAI(**OPENAI_API_CONFIG)\n",
956
+ "gemini_client = get_gemini_client() # Initialize with the first key\n",
957
+ "logging.basicConfig(level=logging.INFO)\n",
958
+ "logger = logging.getLogger(__name__)\n",
959
+ "\n",
960
+ "# --- Helper Functions ---\n",
961
+ "def get_data_hash(file_path: str) -> str:\n",
962
+ " \"\"\"Generate hash of data file for cache validation\"\"\"\n",
963
+ " with open(file_path, \"rb\") as f:\n",
964
+ " return hashlib.md5(f.read()).hexdigest()\n",
965
+ "\n",
966
+ "# --- Custom Embedding Handler with Progress Tracking ---\n",
967
+ "class MistralEmbeddings(Embeddings):\n",
968
+ " \"\"\"E5-Mistral-7B embedding adapter with error handling and progress tracking\"\"\"\n",
969
+ " def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
970
+ " embeddings = []\n",
971
+ " try:\n",
972
+ " # Process in batches with progress tracking\n",
973
+ " for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc=\"Embedding Progress\"):\n",
974
+ " batch = texts[i:i + EMBEDDING_BATCH_SIZE]\n",
975
+ " response = client.embeddings.create(\n",
976
+ " input=batch,\n",
977
+ " model=embedding_model,\n",
978
+ " encoding_format=\"float\"\n",
979
+ " )\n",
980
+ " embeddings.extend([e.embedding for e in response.data])\n",
981
+ " return embeddings\n",
982
+ " except Exception as e:\n",
983
+ " logger.error(f\"Embedding Error: {str(e)}\")\n",
984
+ " return [[] for _ in texts]\n",
985
+ " \n",
986
+ " def embed_query(self, text: str) -> List[float]:\n",
987
+ " return self.embed_documents([text])[0]\n",
988
+ "\n",
989
+ "# --- Data Processing with Cache Validation ---\n",
990
+ "def load_and_chunk_data(file_path: str) -> List[Document]:\n",
991
+ " \"\"\"Enhanced chunking with metadata preservation\"\"\"\n",
992
+ " current_hash = get_data_hash(file_path)\n",
993
+ " cache_file = f\"documents_{CACHE_VERSION}_{current_hash}.pkl\"\n",
994
+ " \n",
995
+ " if os.path.exists(cache_file):\n",
996
+ " logger.info(\"Loading cached documents\")\n",
997
+ " with open(cache_file, \"rb\") as f:\n",
998
+ " return pickle.load(f)\n",
999
+ " \n",
1000
+ " with open(file_path, 'r', encoding='utf-8') as f:\n",
1001
+ " data = json.load(f)\n",
1002
+ " \n",
1003
+ " documents = []\n",
1004
+ " for item in tqdm(data, desc=\"Chunking Progress\"):\n",
1005
+ " base_content = f\"\"\"Source: {item['Source']}\n",
1006
+ "Application: {item['Application']}\n",
1007
+ "Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}\n",
1008
+ "Technical Concepts: {', '.join(item['technical_concepts'])}\n",
1009
+ "Biological Mechanisms: {', '.join(item['biological_mechanisms'])}\"\"\"\n",
1010
+ " \n",
1011
+ " strategy = item['Strategy']\n",
1012
+ " for i in range(0, len(strategy), CHUNK_SIZE - OVERLAP):\n",
1013
+ " chunk = strategy[i:i + CHUNK_SIZE]\n",
1014
+ " documents.append(Document(\n",
1015
+ " page_content=f\"{base_content}\\nStrategy Excerpt:\\n{chunk}\",\n",
1016
+ " metadata={\n",
1017
+ " \"source\": item[\"Source\"],\n",
1018
+ " \"application\": item[\"Application\"],\n",
1019
+ " \"technical_concepts\": item[\"technical_concepts\"],\n",
1020
+ " \"sustainability_impacts\": item[\"sustainability_impacts\"],\n",
1021
+ " \"hyperlink\": item[\"Hyperlink\"],\n",
1022
+ " \"chunk_id\": f\"{item['Source']}-{len(documents)+1}\"\n",
1023
+ " }\n",
1024
+ " ))\n",
1025
+ " \n",
1026
+ " with open(cache_file, \"wb\") as f:\n",
1027
+ " pickle.dump(documents, f)\n",
1028
+ " return documents\n",
1029
+ "\n",
1030
+ "# --- Optimized Retrieval System ---\n",
1031
+ "class EnhancedRetriever:\n",
1032
+ " \"\"\"Hybrid retriever with persistent caching\"\"\"\n",
1033
+ " def __init__(self, documents: List[Document]):\n",
1034
+ " self.documents = documents\n",
1035
+ " self.bm25 = self._init_bm25()\n",
1036
+ " self.vector_store = self._init_faiss()\n",
1037
+ " self.vector_retriever = self.vector_store.as_retriever(search_kwargs={\"k\": 3})\n",
1038
+ "\n",
1039
+ " def _init_bm25(self) -> BM25Retriever:\n",
1040
+ " cache_key = f\"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
1041
+ " if os.path.exists(cache_key):\n",
1042
+ " logger.info(\"Loading cached BM25 index\")\n",
1043
+ " with open(cache_key, \"rb\") as f:\n",
1044
+ " return pickle.load(f)\n",
1045
+ " \n",
1046
+ " logger.info(\"Building new BM25 index\")\n",
1047
+ " retriever = BM25Retriever.from_documents(self.documents)\n",
1048
+ " retriever.k = 5\n",
1049
+ " with open(cache_key, \"wb\") as f:\n",
1050
+ " pickle.dump(retriever, f)\n",
1051
+ " return retriever\n",
1052
+ "\n",
1053
+ " def _init_faiss(self) -> FAISS:\n",
1054
+ " cache_key = f\"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
1055
+ " if os.path.exists(cache_key):\n",
1056
+ " logger.info(\"Loading cached FAISS index\")\n",
1057
+ " return FAISS.load_local(\n",
1058
+ " cache_key,\n",
1059
+ " MistralEmbeddings(),\n",
1060
+ " allow_dangerous_deserialization=True\n",
1061
+ " )\n",
1062
+ " \n",
1063
+ " logger.info(\"Building new FAISS index\")\n",
1064
+ " vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())\n",
1065
+ " vector_store.save_local(cache_key)\n",
1066
+ " return vector_store\n",
1067
+ "\n",
1068
+ " @lru_cache(maxsize=500)\n",
1069
+ " def retrieve(self, query: str) -> str:\n",
1070
+ " try:\n",
1071
+ " processed_query = self._preprocess_query(query)\n",
1072
+ " expanded_query = self._hyde_expansion(processed_query)\n",
1073
+ " \n",
1074
+ " bm25_results = self.bm25.invoke(processed_query)\n",
1075
+ " vector_results = self.vector_retriever.invoke(processed_query)\n",
1076
+ " expanded_results = self.bm25.invoke(expanded_query)\n",
1077
+ " \n",
1078
+ " fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])\n",
1079
+ " return self._format_context(fused_results[:5])\n",
1080
+ " except Exception as e:\n",
1081
+ " logger.error(f\"Retrieval Error: {str(e)}\")\n",
1082
+ " return \"\"\n",
1083
+ "\n",
1084
+ " def _preprocess_query(self, query: str) -> str:\n",
1085
+ " return query.lower().strip()\n",
1086
+ "\n",
1087
+ " @lru_cache(maxsize=500)\n",
1088
+ " def _hyde_expansion(self, query: str) -> str:\n",
1089
+ " try:\n",
1090
+ " response = gemini_client.models.generate_content( # Use Gemini client for HyDE\n",
1091
+ " model=generation_model,\n",
1092
+ " contents=f\"Generate a technical draft about biomimicry for: {query}\\nInclude domain-specific terms.\"\n",
1093
+ " )\n",
1094
+ " return response.text\n",
1095
+ " except Exception as e:\n",
1096
+ " logger.error(f\"HyDE Error: {str(e)}\")\n",
1097
+ " return query\n",
1098
+ "\n",
1099
+ " def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:\n",
1100
+ " fused_scores = defaultdict(float)\n",
1101
+ " for docs in result_sets:\n",
1102
+ " for rank, doc in enumerate(docs, 1):\n",
1103
+ " fused_scores[doc.metadata[\"chunk_id\"]] += 1 / (rank + 60)\n",
1104
+ " \n",
1105
+ " seen = set()\n",
1106
+ " return [\n",
1107
+ " doc for doc in sorted(\n",
1108
+ " (doc for docs in result_sets for doc in docs),\n",
1109
+ " key=lambda x: fused_scores[x.metadata[\"chunk_id\"]],\n",
1110
+ " reverse=True\n",
1111
+ " ) if not (doc.metadata[\"chunk_id\"] in seen or seen.add(doc.metadata[\"chunk_id\"]))\n",
1112
+ " ]\n",
1113
+ "\n",
1114
+ " def _format_context(self, docs: List[Document]) -> str:\n",
1115
+ " context = []\n",
1116
+ " for doc in docs:\n",
1117
+ " context_str = f\"\"\"**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})\n",
1118
+ " **Application**: {doc.metadata['application']}\n",
1119
+ " **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}\n",
1120
+ " **Strategy Excerpt**:\\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}\"\"\"\n",
1121
+ " context.append(context_str)\n",
1122
+ " return \"\\n\\n---\\n\\n\".join(context)\n",
1123
+ "\n",
1124
+ "# --- Generation System ---\n",
1125
+ "SYSTEM_PROMPT = \"\"\"**Biomimicry Expert Guidelines**\n",
1126
+ "1. Base answers strictly on context\n",
1127
+ "2. **Bold** technical terms\n",
1128
+ "3. Include reference links at the end of the response\n",
1129
+ "\n",
1130
+ "Context: {context}\"\"\"\n",
1131
+ "\n",
1132
+ "@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))\n",
1133
+ "def get_ai_response(query: str, context: str) -> str:\n",
1134
+ " global gemini_client\n",
1135
+ " try:\n",
1136
+ " # Simulate a rate limit error for testing\n",
1137
+ " if \"test\" in query.lower():\n",
1138
+ " raise Exception(\"Simulated rate limit error\")\n",
1139
+ " \n",
1140
+ " response = gemini_client.models.generate_content( # Use Gemini client for generation\n",
1141
+ " model=generation_model,\n",
1142
+ " contents=f\"{SYSTEM_PROMPT.format(context=context)}\\nQuestion: {query}\\nProvide a detailed technical answer:\"\n",
1143
+ " )\n",
1144
+ " logger.info(f\"Raw Response: {response.text}\") # Log raw response\n",
1145
+ " return _postprocess_response(response.text)\n",
1146
+ " except Exception as e:\n",
1147
+ " logger.error(f\"Generation Error: {str(e)}\")\n",
1148
+ " gemini_client = switch_gemini_key() # Switch to the next API key\n",
1149
+ " return \"I'm unable to generate a response right now. Please try again later.\"\n",
1150
+ "\n",
1151
+ "def _postprocess_response(response: str) -> str:\n",
1152
+ " response = re.sub(r\"\\[(.*?)\\]\", r\"[\\1](#)\", response)\n",
1153
+ " response = re.sub(r\"\\*\\*([\\w-]+)\\*\\*\", r\"**\\1**\", response)\n",
1154
+ " return response\n",
1155
+ "\n",
1156
+ "# --- Optimized Pipeline ---\n",
1157
+ "documents = load_and_chunk_data(data_file_name)\n",
1158
+ "retriever = EnhancedRetriever(documents)\n",
1159
+ "\n",
1160
+ "def generate_response(question: str) -> str:\n",
1161
+ " try:\n",
1162
+ " context = retriever.retrieve(question)\n",
1163
+ " return get_ai_response(question, context) if context else \"No relevant information found.\"\n",
1164
+ " except Exception as e:\n",
1165
+ " logger.error(f\"Pipeline Error: {str(e)}\")\n",
1166
+ " return \"An error occurred processing your request.\"\n",
1167
+ "\n",
1168
+ "# --- Gradio Interface ---\n",
1169
+ "def chat_interface(question: str, history: List[Tuple[str, str]]):\n",
1170
+ " response = generate_response(question)\n",
1171
+ " return \"\", history + [(question, response)]\n",
1172
+ "\n",
1173
+ "with gr.Blocks(title=\"AskNature BioRAG Expert\", theme=gr.themes.Soft()) as demo:\n",
1174
+ " gr.Markdown(\"# 🌿 AskNature RAG-based Chatbot \")\n",
1175
+ " with gr.Row():\n",
1176
+ " chatbot = gr.Chatbot(label=\"Dialogue History\", height=500)\n",
1177
+ " with gr.Row():\n",
1178
+ " question = gr.Textbox(placeholder=\"Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')\",\n",
1179
+ " label=\"Inquiry\", scale=4)\n",
1180
+ " clear_btn = gr.Button(\"Clear History\", variant=\"secondary\")\n",
1181
+ " \n",
1182
+ " gr.Markdown(\"\"\"\n",
1183
+ " <div style=\"text-align: center; color: #4a7c59;\">\n",
1184
+ " <small>Powered by AskNature's Database | \n",
1185
+ " Explore nature's blueprints at <a href=\"https://asknature.org\">asknature.org</a></small>\n",
1186
+ " </div>\"\"\")\n",
1187
+ " question.submit(chat_interface, [question, chatbot], [question, chatbot])\n",
1188
+ " clear_btn.click(lambda: [], None, chatbot)\n",
1189
+ "\n",
1190
+ "# --- Rate Limit Testing ---\n",
1191
+ "def test_rate_limit():\n",
1192
+ " \"\"\"Simulate high-volume requests to test rate limit handling\"\"\"\n",
1193
+ " test_questions = [\n",
1194
+ " \"How do coral proteins help make eco-friendly fabrics without dyes?\",\n",
1195
+ " \"What environmental problems do coral-inspired textiles solve?\",\n",
1196
+ " \"What is industrial symbiosis and how does the Kalundborg example work?\",\n",
1197
+ " \"How do Metavision sensors work like human eyes to save energy?\",\n",
1198
+ " \"How does TISSIUM copy skin proteins for medical adhesives?\",\n",
1199
+ " \"How does DNA-level design create better fibers inspired by nature?\",\n",
1200
+ " \"Why is industrial symbiosis hard to implement despite benefits?\",\n",
1201
+ " \"How can biological systems inspire sustainable manufacturing?\",\n",
1202
+ " \"What other industries can use protein-based materials like Werewool?\",\n",
1203
+ " \"How could event-based cameras improve security systems?\",\n",
1204
+ " \"Design a factory network that works like coral reef partnerships - what features would it need?\"\n",
1205
+ " ]\n",
1206
+ "\n",
1207
+ " for i, question in enumerate(test_questions):\n",
1208
+ " print(f\"\\nSending query {i+1}: {question}\")\n",
1209
+ " response = generate_response(question)\n",
1210
+ " print(f\"Response: {response}\")\n",
1211
+ " time.sleep(0.5) # Add a small delay between requests\n",
1212
+ "\n",
1213
+ "# Run the rate limit test in a separate thread\n",
1214
+ "if __name__ == \"__main__\":\n",
1215
+ " gradio_thread = Thread(target=demo.launch, kwargs={\"show_error\": True})\n",
1216
+ " gradio_thread.start()\n",
1217
+ " time.sleep(5)\n",
1218
+ " test_rate_limit()"
1219
+ ]
1220
+ },
1221
+ {
1222
+ "cell_type": "code",
1223
+ "execution_count": null,
1224
+ "metadata": {},
1225
+ "outputs": [],
1226
+ "source": []
1227
+ }
1228
+ ],
1229
+ "metadata": {
1230
+ "kernelspec": {
1231
+ "display_name": "rag",
1232
+ "language": "python",
1233
+ "name": "python3"
1234
+ },
1235
+ "language_info": {
1236
+ "codemirror_mode": {
1237
+ "name": "ipython",
1238
+ "version": 3
1239
+ },
1240
+ "file_extension": ".py",
1241
+ "mimetype": "text/x-python",
1242
+ "name": "python",
1243
+ "nbconvert_exporter": "python",
1244
+ "pygments_lexer": "ipython3",
1245
+ "version": "3.12.8"
1246
+ }
1247
+ },
1248
+ "nbformat": 4,
1249
+ "nbformat_minor": 2
1250
+ }
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain
3
+ openai
4
+ tenacity
5
+ langchain-core
6
+ langchain-community
7
+ langchain-llm
8
+ protobuf
9
+ numpy
10
+ scipy
11
+ faiss-cpu
12
+ transformers
13
+ sentencepiece
14
+ regex
15
+ json5
16
+ rank_bm25
17
+ huggingface_hub
18
+ tqdm
19
+ sentence-transformers
20
+ <<<<<<< HEAD
21
+ google
22
+ google-cloud
23
+ =======
24
+ google
25
+ google-cloud
26
+ >>>>>>> 51466f9c2c65701d4b45dd8e842e1a151f75959b