Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,8 +7,8 @@ import os
|
|
| 7 |
import pickle
|
| 8 |
from typing import List, Tuple, Optional
|
| 9 |
import gradio as gr
|
| 10 |
-
from openai import OpenAI
|
| 11 |
-
|
| 12 |
from functools import lru_cache
|
| 13 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 14 |
from langchain_community.retrievers import BM25Retriever
|
|
@@ -17,24 +17,18 @@ from langchain_core.embeddings import Embeddings
|
|
| 17 |
from langchain_core.documents import Document
|
| 18 |
from collections import defaultdict
|
| 19 |
import hashlib
|
| 20 |
-
from tqdm import tqdm
|
| 21 |
|
| 22 |
from dotenv import load_dotenv
|
| 23 |
load_dotenv()
|
|
|
|
| 24 |
# --- Configuration ---
|
| 25 |
FAISS_INDEX_PATH = "faiss_index"
|
| 26 |
BM25_INDEX_PATH = "bm25_index.pkl"
|
| 27 |
-
CACHE_VERSION = "v1"
|
| 28 |
-
embedding_model = "e5-mistral-7b-instruct"
|
| 29 |
-
generation_model = "gemini-
|
| 30 |
data_file_name = "AskNatureNet_data_enhanced.json"
|
| 31 |
-
API_CONFIG = {
|
| 32 |
-
"gemini_api_key": os.getenv("GEMINI_API_KEY") # Gemini API key for generation
|
| 33 |
-
}
|
| 34 |
-
|
| 35 |
-
CHUNK_SIZE = 800
|
| 36 |
-
OVERLAP = 200
|
| 37 |
-
EMBEDDING_BATCH_SIZE = 32 # Batch size for embedding API calls
|
| 38 |
|
| 39 |
# Initialize clients
|
| 40 |
OPENAI_API_CONFIG = {
|
|
@@ -42,7 +36,11 @@ OPENAI_API_CONFIG = {
|
|
| 42 |
"base_url": "https://chat-ai.academiccloud.de/v1"
|
| 43 |
}
|
| 44 |
client = OpenAI(**OPENAI_API_CONFIG)
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
logging.basicConfig(level=logging.INFO)
|
| 47 |
logger = logging.getLogger(__name__)
|
| 48 |
|
|
@@ -52,13 +50,12 @@ def get_data_hash(file_path: str) -> str:
|
|
| 52 |
with open(file_path, "rb") as f:
|
| 53 |
return hashlib.md5(f.read()).hexdigest()
|
| 54 |
|
| 55 |
-
# --- Custom Embedding Handler
|
| 56 |
class MistralEmbeddings(Embeddings):
|
| 57 |
-
"""E5-Mistral-7B embedding adapter
|
| 58 |
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
| 59 |
embeddings = []
|
| 60 |
try:
|
| 61 |
-
# Process in batches with progress tracking
|
| 62 |
for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="Embedding Progress"):
|
| 63 |
batch = texts[i:i + EMBEDDING_BATCH_SIZE]
|
| 64 |
response = client.embeddings.create(
|
|
@@ -75,7 +72,7 @@ class MistralEmbeddings(Embeddings):
|
|
| 75 |
def embed_query(self, text: str) -> List[float]:
|
| 76 |
return self.embed_documents([text])[0]
|
| 77 |
|
| 78 |
-
# --- Data Processing
|
| 79 |
def load_and_chunk_data(file_path: str) -> List[Document]:
|
| 80 |
"""Enhanced chunking with metadata preservation"""
|
| 81 |
current_hash = get_data_hash(file_path)
|
|
@@ -176,9 +173,8 @@ class EnhancedRetriever:
|
|
| 176 |
@lru_cache(maxsize=500)
|
| 177 |
def _hyde_expansion(self, query: str) -> str:
|
| 178 |
try:
|
| 179 |
-
response =
|
| 180 |
-
|
| 181 |
-
contents=f"Generate a technical draft about biomimicry for: {query}\nInclude domain-specific terms."
|
| 182 |
)
|
| 183 |
return response.text
|
| 184 |
except Exception as e:
|
|
@@ -221,11 +217,10 @@ Context: {context}"""
|
|
| 221 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))
|
| 222 |
def get_ai_response(query: str, context: str) -> str:
|
| 223 |
try:
|
| 224 |
-
response =
|
| 225 |
-
|
| 226 |
-
contents=f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
|
| 227 |
)
|
| 228 |
-
logger.info(f"Raw Response: {response.text}")
|
| 229 |
return _postprocess_response(response.text)
|
| 230 |
except Exception as e:
|
| 231 |
logger.error(f"Generation Error: {str(e)}")
|
|
@@ -236,7 +231,7 @@ def _postprocess_response(response: str) -> str:
|
|
| 236 |
response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
|
| 237 |
return response
|
| 238 |
|
| 239 |
-
# ---
|
| 240 |
documents = load_and_chunk_data(data_file_name)
|
| 241 |
retriever = EnhancedRetriever(documents)
|
| 242 |
|
|
@@ -262,11 +257,9 @@ with gr.Blocks(title="AskNature BioRAG Expert", theme=gr.themes.Soft()) as demo:
|
|
| 262 |
label="Inquiry", scale=4)
|
| 263 |
clear_btn = gr.Button("Clear History", variant="secondary")
|
| 264 |
|
| 265 |
-
gr.Markdown("""
|
| 266 |
-
<
|
| 267 |
-
|
| 268 |
-
Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small>
|
| 269 |
-
</div>""")
|
| 270 |
question.submit(chat_interface, [question, chatbot], [question, chatbot])
|
| 271 |
clear_btn.click(lambda: [], None, chatbot)
|
| 272 |
|
|
|
|
| 7 |
import pickle
|
| 8 |
from typing import List, Tuple, Optional
|
| 9 |
import gradio as gr
|
| 10 |
+
from openai import OpenAI
|
| 11 |
+
import google.generativeai as genai
|
| 12 |
from functools import lru_cache
|
| 13 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 14 |
from langchain_community.retrievers import BM25Retriever
|
|
|
|
| 17 |
from langchain_core.documents import Document
|
| 18 |
from collections import defaultdict
|
| 19 |
import hashlib
|
| 20 |
+
from tqdm import tqdm
|
| 21 |
|
| 22 |
from dotenv import load_dotenv
|
| 23 |
load_dotenv()
|
| 24 |
+
|
| 25 |
# --- Configuration ---
|
| 26 |
FAISS_INDEX_PATH = "faiss_index"
|
| 27 |
BM25_INDEX_PATH = "bm25_index.pkl"
|
| 28 |
+
CACHE_VERSION = "v1"
|
| 29 |
+
embedding_model = "e5-mistral-7b-instruct"
|
| 30 |
+
generation_model = "gemini-1.5-flash"
|
| 31 |
data_file_name = "AskNatureNet_data_enhanced.json"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
# Initialize clients
|
| 34 |
OPENAI_API_CONFIG = {
|
|
|
|
| 36 |
"base_url": "https://chat-ai.academiccloud.de/v1"
|
| 37 |
}
|
| 38 |
client = OpenAI(**OPENAI_API_CONFIG)
|
| 39 |
+
|
| 40 |
+
# Configure Gemini
|
| 41 |
+
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
| 42 |
+
gemini_model = genai.GenerativeModel(generation_model)
|
| 43 |
+
|
| 44 |
logging.basicConfig(level=logging.INFO)
|
| 45 |
logger = logging.getLogger(__name__)
|
| 46 |
|
|
|
|
| 50 |
with open(file_path, "rb") as f:
|
| 51 |
return hashlib.md5(f.read()).hexdigest()
|
| 52 |
|
| 53 |
+
# --- Custom Embedding Handler ---
|
| 54 |
class MistralEmbeddings(Embeddings):
|
| 55 |
+
"""E5-Mistral-7B embedding adapter"""
|
| 56 |
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
| 57 |
embeddings = []
|
| 58 |
try:
|
|
|
|
| 59 |
for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="Embedding Progress"):
|
| 60 |
batch = texts[i:i + EMBEDDING_BATCH_SIZE]
|
| 61 |
response = client.embeddings.create(
|
|
|
|
| 72 |
def embed_query(self, text: str) -> List[float]:
|
| 73 |
return self.embed_documents([text])[0]
|
| 74 |
|
| 75 |
+
# --- Data Processing ---
|
| 76 |
def load_and_chunk_data(file_path: str) -> List[Document]:
|
| 77 |
"""Enhanced chunking with metadata preservation"""
|
| 78 |
current_hash = get_data_hash(file_path)
|
|
|
|
| 173 |
@lru_cache(maxsize=500)
|
| 174 |
def _hyde_expansion(self, query: str) -> str:
|
| 175 |
try:
|
| 176 |
+
response = gemini_model.generate_content(
|
| 177 |
+
f"Generate a technical draft about biomimicry for: {query}\nInclude domain-specific terms."
|
|
|
|
| 178 |
)
|
| 179 |
return response.text
|
| 180 |
except Exception as e:
|
|
|
|
| 217 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))
|
| 218 |
def get_ai_response(query: str, context: str) -> str:
|
| 219 |
try:
|
| 220 |
+
response = gemini_model.generate_content(
|
| 221 |
+
f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
|
|
|
|
| 222 |
)
|
| 223 |
+
logger.info(f"Raw Response: {response.text}")
|
| 224 |
return _postprocess_response(response.text)
|
| 225 |
except Exception as e:
|
| 226 |
logger.error(f"Generation Error: {str(e)}")
|
|
|
|
| 231 |
response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
|
| 232 |
return response
|
| 233 |
|
| 234 |
+
# --- Pipeline ---
|
| 235 |
documents = load_and_chunk_data(data_file_name)
|
| 236 |
retriever = EnhancedRetriever(documents)
|
| 237 |
|
|
|
|
| 257 |
label="Inquiry", scale=4)
|
| 258 |
clear_btn = gr.Button("Clear History", variant="secondary")
|
| 259 |
|
| 260 |
+
gr.Markdown("""<div style="text-align: center; color: #4a7c59;">
|
| 261 |
+
<small>Powered by AskNature's Database |
|
| 262 |
+
Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small></div>""")
|
|
|
|
|
|
|
| 263 |
question.submit(chat_interface, [question, chatbot], [question, chatbot])
|
| 264 |
clear_btn.click(lambda: [], None, chatbot)
|
| 265 |
|