Mohamed284 committed on
Commit
389dc82
·
verified ·
1 Parent(s): 5458e43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -31
app.py CHANGED
@@ -7,8 +7,8 @@ import os
7
  import pickle
8
  from typing import List, Tuple, Optional
9
  import gradio as gr
10
- from openai import OpenAI
11
- from google import genai
12
  from functools import lru_cache
13
  from tenacity import retry, stop_after_attempt, wait_exponential
14
  from langchain_community.retrievers import BM25Retriever
@@ -17,24 +17,18 @@ from langchain_core.embeddings import Embeddings
17
  from langchain_core.documents import Document
18
  from collections import defaultdict
19
  import hashlib
20
- from tqdm import tqdm
21
 
22
  from dotenv import load_dotenv
23
  load_dotenv()
 
24
  # --- Configuration ---
25
  FAISS_INDEX_PATH = "faiss_index"
26
  BM25_INDEX_PATH = "bm25_index.pkl"
27
- CACHE_VERSION = "v1" # Increment when data format changes
28
- embedding_model = "e5-mistral-7b-instruct" # OpenAI embedding model
29
- generation_model = "gemini-2.0-flash" # Gemini generation model
30
  data_file_name = "AskNatureNet_data_enhanced.json"
31
- API_CONFIG = {
32
- "gemini_api_key": os.getenv("GEMINI_API_KEY") # Gemini API key for generation
33
- }
34
-
35
- CHUNK_SIZE = 800
36
- OVERLAP = 200
37
- EMBEDDING_BATCH_SIZE = 32 # Batch size for embedding API calls
38
 
39
  # Initialize clients
40
  OPENAI_API_CONFIG = {
@@ -42,7 +36,11 @@ OPENAI_API_CONFIG = {
42
  "base_url": "https://chat-ai.academiccloud.de/v1"
43
  }
44
  client = OpenAI(**OPENAI_API_CONFIG)
45
- gemini_client = genai.Client(api_key=API_CONFIG["gemini_api_key"]) # Gemini client for generation
 
 
 
 
46
  logging.basicConfig(level=logging.INFO)
47
  logger = logging.getLogger(__name__)
48
 
@@ -52,13 +50,12 @@ def get_data_hash(file_path: str) -> str:
52
  with open(file_path, "rb") as f:
53
  return hashlib.md5(f.read()).hexdigest()
54
 
55
- # --- Custom Embedding Handler with Progress Tracking ---
56
  class MistralEmbeddings(Embeddings):
57
- """E5-Mistral-7B embedding adapter with error handling and progress tracking"""
58
  def embed_documents(self, texts: List[str]) -> List[List[float]]:
59
  embeddings = []
60
  try:
61
- # Process in batches with progress tracking
62
  for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="Embedding Progress"):
63
  batch = texts[i:i + EMBEDDING_BATCH_SIZE]
64
  response = client.embeddings.create(
@@ -75,7 +72,7 @@ class MistralEmbeddings(Embeddings):
75
  def embed_query(self, text: str) -> List[float]:
76
  return self.embed_documents([text])[0]
77
 
78
- # --- Data Processing with Cache Validation ---
79
  def load_and_chunk_data(file_path: str) -> List[Document]:
80
  """Enhanced chunking with metadata preservation"""
81
  current_hash = get_data_hash(file_path)
@@ -176,9 +173,8 @@ class EnhancedRetriever:
176
  @lru_cache(maxsize=500)
177
  def _hyde_expansion(self, query: str) -> str:
178
  try:
179
- response = gemini_client.models.generate_content( # Use Gemini client for HyDE
180
- model=generation_model,
181
- contents=f"Generate a technical draft about biomimicry for: {query}\nInclude domain-specific terms."
182
  )
183
  return response.text
184
  except Exception as e:
@@ -221,11 +217,10 @@ Context: {context}"""
221
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))
222
  def get_ai_response(query: str, context: str) -> str:
223
  try:
224
- response = gemini_client.models.generate_content( # Use Gemini client for generation
225
- model=generation_model,
226
- contents=f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
227
  )
228
- logger.info(f"Raw Response: {response.text}") # Log raw response
229
  return _postprocess_response(response.text)
230
  except Exception as e:
231
  logger.error(f"Generation Error: {str(e)}")
@@ -236,7 +231,7 @@ def _postprocess_response(response: str) -> str:
236
  response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
237
  return response
238
 
239
- # --- Optimized Pipeline ---
240
  documents = load_and_chunk_data(data_file_name)
241
  retriever = EnhancedRetriever(documents)
242
 
@@ -262,11 +257,9 @@ with gr.Blocks(title="AskNature BioRAG Expert", theme=gr.themes.Soft()) as demo:
262
  label="Inquiry", scale=4)
263
  clear_btn = gr.Button("Clear History", variant="secondary")
264
 
265
- gr.Markdown("""
266
- <div style="text-align: center; color: #4a7c59;">
267
- <small>Powered by AskNature's Database |
268
- Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small>
269
- </div>""")
270
  question.submit(chat_interface, [question, chatbot], [question, chatbot])
271
  clear_btn.click(lambda: [], None, chatbot)
272
 
 
7
  import pickle
8
  from typing import List, Tuple, Optional
9
  import gradio as gr
10
+ from openai import OpenAI
11
+ import google.generativeai as genai
12
  from functools import lru_cache
13
  from tenacity import retry, stop_after_attempt, wait_exponential
14
  from langchain_community.retrievers import BM25Retriever
 
17
  from langchain_core.documents import Document
18
  from collections import defaultdict
19
  import hashlib
20
+ from tqdm import tqdm
21
 
22
  from dotenv import load_dotenv
23
  load_dotenv()
24
+
25
  # --- Configuration ---
26
  FAISS_INDEX_PATH = "faiss_index"
27
  BM25_INDEX_PATH = "bm25_index.pkl"
28
+ CACHE_VERSION = "v1"
29
+ embedding_model = "e5-mistral-7b-instruct"
30
+ generation_model = "gemini-1.5-flash"
31
  data_file_name = "AskNatureNet_data_enhanced.json"
 
 
 
 
 
 
 
32
 
33
  # Initialize clients
34
  OPENAI_API_CONFIG = {
 
36
  "base_url": "https://chat-ai.academiccloud.de/v1"
37
  }
38
  client = OpenAI(**OPENAI_API_CONFIG)
39
+
40
+ # Configure Gemini
41
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
42
+ gemini_model = genai.GenerativeModel(generation_model)
43
+
44
  logging.basicConfig(level=logging.INFO)
45
  logger = logging.getLogger(__name__)
46
 
 
50
  with open(file_path, "rb") as f:
51
  return hashlib.md5(f.read()).hexdigest()
52
 
53
+ # --- Custom Embedding Handler ---
54
  class MistralEmbeddings(Embeddings):
55
+ """E5-Mistral-7B embedding adapter"""
56
  def embed_documents(self, texts: List[str]) -> List[List[float]]:
57
  embeddings = []
58
  try:
 
59
  for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="Embedding Progress"):
60
  batch = texts[i:i + EMBEDDING_BATCH_SIZE]
61
  response = client.embeddings.create(
 
72
  def embed_query(self, text: str) -> List[float]:
73
  return self.embed_documents([text])[0]
74
 
75
+ # --- Data Processing ---
76
  def load_and_chunk_data(file_path: str) -> List[Document]:
77
  """Enhanced chunking with metadata preservation"""
78
  current_hash = get_data_hash(file_path)
 
173
  @lru_cache(maxsize=500)
174
  def _hyde_expansion(self, query: str) -> str:
175
  try:
176
+ response = gemini_model.generate_content(
177
+ f"Generate a technical draft about biomimicry for: {query}\nInclude domain-specific terms."
 
178
  )
179
  return response.text
180
  except Exception as e:
 
217
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))
218
  def get_ai_response(query: str, context: str) -> str:
219
  try:
220
+ response = gemini_model.generate_content(
221
+ f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
 
222
  )
223
+ logger.info(f"Raw Response: {response.text}")
224
  return _postprocess_response(response.text)
225
  except Exception as e:
226
  logger.error(f"Generation Error: {str(e)}")
 
231
  response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
232
  return response
233
 
234
+ # --- Pipeline ---
235
  documents = load_and_chunk_data(data_file_name)
236
  retriever = EnhancedRetriever(documents)
237
 
 
257
  label="Inquiry", scale=4)
258
  clear_btn = gr.Button("Clear History", variant="secondary")
259
 
260
+ gr.Markdown("""<div style="text-align: center; color: #4a7c59;">
261
+ <small>Powered by AskNature's Database |
262
+ Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small></div>""")
 
 
263
  question.submit(chat_interface, [question, chatbot], [question, chatbot])
264
  clear_btn.click(lambda: [], None, chatbot)
265