Spaces:
Build error
Build error
Upload 8 files
Browse files- .env +18 -0
- .gitattributes +2 -0
- .gitignore +5 -0
- AskNatureNet_data_enhanced.json +0 -0
- app.py +548 -0
- bm25_index.pkl_5c0c37d3cbc20e235eeec7cffd2d312f +3 -0
- documents_v1_5c0c37d3cbc20e235eeec7cffd2d312f.pkl +3 -0
- main.ipynb +1250 -0
- requirements.txt +26 -0
.env
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<<<<<<< HEAD
|
| 2 |
+
# API Configuration
|
| 3 |
+
OPENAI_API_KEY="d1c9ed1ca70b9721dee1087d93f9662a"
|
| 4 |
+
GEMINI_API_KEY="AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
|
| 5 |
+
# GCP_PROJECT_ID="1008673779731"
|
| 6 |
+
# GCP_API_KEY="AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
|
| 7 |
+
|
| 8 |
+
GEMINI_API_KEY_1= "AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
|
| 9 |
+
=======
|
| 10 |
+
# API Configuration
|
| 11 |
+
OPENAI_API_KEY="d1c9ed1ca70b9721dee1087d93f9662a"
|
| 12 |
+
GEMINI_API_KEY="AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
|
| 13 |
+
# GCP_PROJECT_ID="1008673779731"
|
| 14 |
+
# GCP_API_KEY="AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
|
| 15 |
+
|
| 16 |
+
GEMINI_API_KEY_1= "AIzaSyDDWHYpQKQ5glnQn5Q-kMTjliwpNfYBpeY"
|
| 17 |
+
>>>>>>> 51466f9c2c65701d4b45dd8e842e1a151f75959b
|
| 18 |
+
GEMINI_API_KEY_2= "AIzaSyDzQSzM9vA6Le36V65I2meN5URclq4JSx0"
|
.gitattributes
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
bm25_index.pkl_5c0c37d3cbc20e235eeec7cffd2d312f filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
documents_v1_5c0c37d3cbc20e235eeec7cffd2d312f.pkl filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<<<<<<< HEAD
|
| 2 |
+
.env
|
| 3 |
+
=======
|
| 4 |
+
.env
|
| 5 |
+
>>>>>>> 51466f9c2c65701d4b45dd8e842e1a151f75959b
|
AskNatureNet_data_enhanced.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app.py
ADDED
|
@@ -0,0 +1,548 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optimized RAG System with E5-Mistral Embeddings and Gemini Flash Generation
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import re
|
| 5 |
+
import os
|
| 6 |
+
import pickle
|
| 7 |
+
from typing import List, Tuple, Optional
|
| 8 |
+
import gradio as gr
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
from google import genai
|
| 11 |
+
from functools import lru_cache
|
| 12 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 13 |
+
from langchain_community.retrievers import BM25Retriever
|
| 14 |
+
from langchain_community.vectorstores import FAISS
|
| 15 |
+
from langchain_core.embeddings import Embeddings
|
| 16 |
+
from langchain_core.documents import Document
|
| 17 |
+
from collections import defaultdict
|
| 18 |
+
import hashlib
|
| 19 |
+
from tqdm import tqdm
|
| 20 |
+
|
| 21 |
+
from dotenv import load_dotenv
|
| 22 |
+
load_dotenv()

# Logging is configured first so that configuration problems below are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Configuration ---
FAISS_INDEX_PATH = "faiss_index"
BM25_INDEX_PATH = "bm25_index.pkl"
CACHE_VERSION = "v1"  # Increment when data format changes
# E5-Mistral embedding model, requested through the OpenAI-compatible
# endpoint configured in OPENAI_API_CONFIG below.
embedding_model = "e5-mistral-7b-instruct"
generation_model = "gemini-2.0-flash"  # Gemini generation model
data_file_name = "AskNatureNet_data_enhanced.json"
API_CONFIG = {
    "gemini_api_key": os.getenv("GEMINI_API_KEY"),  # Gemini API key for generation
}

CHUNK_SIZE = 800
OVERLAP = 200
EMBEDDING_BATCH_SIZE = 32  # Batch size for embedding API calls

# Initialize clients
OPENAI_API_CONFIG = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    "base_url": "https://chat-ai.academiccloud.de/v1",
}

# Name the missing secrets up front; otherwise the failure only shows up as a
# client-construction or request error that does not say which variable is unset.
_missing_keys = [
    name for name in ("OPENAI_API_KEY", "GEMINI_API_KEY") if not os.getenv(name)
]
if _missing_keys:
    logger.warning(
        "Missing environment variable(s): %s", ", ".join(_missing_keys)
    )

client = OpenAI(**OPENAI_API_CONFIG)
gemini_client = genai.Client(api_key=API_CONFIG["gemini_api_key"])  # Gemini client for generation
|
| 47 |
+
|
| 48 |
+
# --- Helper Functions ---
|
| 49 |
+
def get_data_hash(file_path: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The digest is used only to name and validate cache files, not for
    security. The file is read in fixed-size chunks so a large data file
    is never held in memory in full.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter(callable, sentinel) keeps reading 1 MiB blocks until EOF (b"").
        for block in iter(lambda: f.read(1 << 20), b""):
            digest.update(block)
    return digest.hexdigest()
|
| 53 |
+
|
| 54 |
+
# --- Custom Embedding Handler with Progress Tracking ---
|
| 55 |
+
class MistralEmbeddings(Embeddings):
    """Embedding adapter that batches requests to the module-level ``client``.

    Each batch is sent to ``client.embeddings.create`` with the model named by
    ``embedding_model``; a tqdm bar reports progress over the batches.
    """

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in batches of ``EMBEDDING_BATCH_SIZE``.

        Returns:
            One embedding per input text, in input order (the order of
            ``response.data`` is taken as-is).

        Raises:
            Exception: whatever the API call raised, re-raised after logging.
                Placeholder empty vectors are deliberately NOT returned: they
                would be passed on to the vector index as if they were real
                embeddings.
            ValueError: if a batch comes back with a different number of
                vectors than texts sent.
        """
        embeddings: List[List[float]] = []
        # Process in batches with progress tracking
        for start in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="Embedding Progress"):
            batch = texts[start:start + EMBEDDING_BATCH_SIZE]
            try:
                response = client.embeddings.create(
                    input=batch,
                    model=embedding_model,
                    encoding_format="float",
                )
            except Exception as e:
                logger.error(f"Embedding Error: {str(e)}")
                raise
            vectors = [item.embedding for item in response.data]
            if len(vectors) != len(batch):
                raise ValueError(
                    f"Embedding API returned {len(vectors)} vectors for {len(batch)} inputs"
                )
            embeddings.extend(vectors)
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string via ``embed_documents``."""
        return self.embed_documents([text])[0]
|
| 76 |
+
|
| 77 |
+
# --- Data Processing with Cache Validation ---
|
| 78 |
+
def _chunk_text(text: str, size: int, overlap: int) -> List[str]:
    """Split ``text`` into ``size``-character windows that overlap by ``overlap``."""
    step = size - overlap
    if step <= 0:
        # A non-positive step would either raise an opaque range() error or
        # silently produce no chunks at all.
        raise ValueError("CHUNK_SIZE must be greater than OVERLAP")
    return [text[i:i + size] for i in range(0, len(text), step)]


def load_and_chunk_data(file_path: str) -> List[Document]:
    """Load the JSON dataset and split each item's strategy text into chunks.

    Results are cached in ``documents_{CACHE_VERSION}_{md5}.pkl`` where the
    MD5 is that of ``file_path``; a matching cache file is loaded instead of
    re-chunking. Every chunk carries the item's header fields in its
    ``page_content`` and the item's source, application, concepts,
    sustainability impacts, hyperlink and a ``chunk_id`` in its metadata.

    Args:
        file_path: Path to a JSON file holding a list of item dicts.

    Returns:
        The list of chunk ``Document`` objects.

    Raises:
        OSError: if the data file cannot be read or the cache cannot be written.
        KeyError: if an item lacks one of the required fields.
        ValueError: if ``CHUNK_SIZE`` is not greater than ``OVERLAP``.
    """
    current_hash = get_data_hash(file_path)
    cache_file = f"documents_{CACHE_VERSION}_{current_hash}.pkl"

    if os.path.exists(cache_file):
        logger.info("Loading cached documents")
        try:
            # NOTE(security): pickle.load executes code embedded in the file;
            # the cache file must come from a trusted source.
            with open(cache_file, "rb") as f:
                return pickle.load(f)
        except (pickle.UnpicklingError, EOFError) as e:
            # A truncated file or a non-pickle placeholder (e.g. a Git LFS
            # pointer checked out without LFS) should trigger a rebuild, not
            # a crash at startup.
            logger.warning(f"Ignoring unreadable document cache {cache_file}: {str(e)}")

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    documents = []
    for item in tqdm(data, desc="Chunking Progress"):
        base_content = f"""Source: {item['Source']}
Application: {item['Application']}
Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}
Technical Concepts: {', '.join(item['technical_concepts'])}
Biological Mechanisms: {', '.join(item['biological_mechanisms'])}"""

        for chunk in _chunk_text(item['Strategy'], CHUNK_SIZE, OVERLAP):
            documents.append(Document(
                page_content=f"{base_content}\nStrategy Excerpt:\n{chunk}",
                metadata={
                    "source": item["Source"],
                    "application": item["Application"],
                    "technical_concepts": item["technical_concepts"],
                    "sustainability_impacts": item["sustainability_impacts"],
                    "hyperlink": item["Hyperlink"],
                    # Running counter across the whole corpus keeps ids unique.
                    "chunk_id": f"{item['Source']}-{len(documents)+1}"
                }
            ))

    # Write to a temporary name and rename, so an interrupted run never
    # leaves a half-written cache under the final name.
    tmp_file = f"{cache_file}.tmp"
    with open(tmp_file, "wb") as f:
        pickle.dump(documents, f)
    os.replace(tmp_file, cache_file)
    return documents
|
| 117 |
+
|
| 118 |
+
# --- Optimized Retrieval System ---
|
| 119 |
+
class EnhancedRetriever:
    """Hybrid retriever: BM25 and FAISS results fused by reciprocal rank.

    Both indexes are cached on disk under names derived from the MD5 of
    ``data_file_name``. Query results are memoised per instance with
    ``lru_cache``; because the cached callables raise on failure, error
    fallbacks are never stored and a transient API error does not stick to a
    query.
    """

    # Offset added to the rank in reciprocal-rank fusion: 1 / (rank + offset).
    _RRF_OFFSET = 60
    # Maximum number of distinct queries remembered by each per-instance cache.
    _QUERY_CACHE_SIZE = 500

    def __init__(self, documents: List[Document]):
        """Build or load both indexes for ``documents`` and set up query caches."""
        self.documents = documents
        self.bm25 = self._init_bm25()
        self.vector_store = self._init_faiss()
        self.vector_retriever = self.vector_store.as_retriever(search_kwargs={"k": 3})
        # Caches are created per instance (not by decorating the methods) so
        # they are released together with the instance.
        self._cached_retrieve = lru_cache(maxsize=self._QUERY_CACHE_SIZE)(self._retrieve_uncached)
        self._cached_hyde = lru_cache(maxsize=self._QUERY_CACHE_SIZE)(self._hyde_uncached)

    def _init_bm25(self) -> BM25Retriever:
        """Load the pickled BM25 retriever if present, otherwise build and cache it."""
        cache_key = f"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}"
        if os.path.exists(cache_key):
            logger.info("Loading cached BM25 index")
            try:
                # NOTE(security): pickle.load executes code embedded in the
                # file; the cache file must come from a trusted source.
                with open(cache_key, "rb") as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError) as e:
                # Truncated file or non-pickle placeholder: rebuild instead.
                logger.warning(f"Ignoring unreadable BM25 cache {cache_key}: {str(e)}")

        logger.info("Building new BM25 index")
        retriever = BM25Retriever.from_documents(self.documents)
        retriever.k = 5
        # Temp file + rename so an interrupted write cannot leave a broken cache.
        tmp_key = f"{cache_key}.tmp"
        with open(tmp_key, "wb") as f:
            pickle.dump(retriever, f)
        os.replace(tmp_key, cache_key)
        return retriever

    def _init_faiss(self) -> FAISS:
        """Load the saved FAISS store if present, otherwise embed, build and save it."""
        cache_key = f"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}"
        if os.path.exists(cache_key):
            logger.info("Loading cached FAISS index")
            # NOTE(security): allow_dangerous_deserialization=True is explicitly
            # requested here, so the saved index must be trusted.
            return FAISS.load_local(
                cache_key,
                MistralEmbeddings(),
                allow_dangerous_deserialization=True
            )

        logger.info("Building new FAISS index")
        vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())
        vector_store.save_local(cache_key)
        return vector_store

    def retrieve(self, query: str) -> str:
        """Return formatted context for ``query``, or ``""`` if retrieval fails.

        Successful results are cached per query string; failures are logged
        and not cached.
        """
        try:
            return self._cached_retrieve(query)
        except Exception as e:
            logger.error(f"Retrieval Error: {str(e)}")
            return ""

    def _retrieve_uncached(self, query: str) -> str:
        """Run BM25, vector and HyDE-expanded BM25 searches and fuse the top 5."""
        processed_query = self._preprocess_query(query)
        expanded_query = self._hyde_expansion(processed_query)

        bm25_results = self.bm25.invoke(processed_query)
        vector_results = self.vector_retriever.invoke(processed_query)
        expanded_results = self.bm25.invoke(expanded_query)

        fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])
        return self._format_context(fused_results[:5])

    def _preprocess_query(self, query: str) -> str:
        """Lower-case the query and strip surrounding whitespace."""
        return query.lower().strip()

    def _hyde_expansion(self, query: str) -> str:
        """Return a generated draft for ``query``; fall back to ``query`` on error."""
        try:
            return self._cached_hyde(query)
        except Exception as e:
            logger.error(f"HyDE Error: {str(e)}")
            return query

    def _hyde_uncached(self, query: str) -> str:
        """Ask the generation model for a draft; raise if it returns no text."""
        response = gemini_client.models.generate_content(  # Use Gemini client for HyDE
            model=generation_model,
            contents=f"Generate a technical draft about biomimicry for: {query}\nInclude domain-specific terms."
        )
        if not response.text:
            # Raising keeps the empty result out of the cache and lets the
            # caller fall back to the plain query.
            raise ValueError("HyDE generation returned no text")
        return response.text

    def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:
        """Merge ranked lists by reciprocal-rank fusion, one entry per ``chunk_id``.

        Each appearance of a chunk at 1-based ``rank`` adds
        ``1 / (rank + _RRF_OFFSET)`` to its score. The first Document seen for
        a chunk id is the one returned; ties keep first-seen order.
        """
        fused_scores = defaultdict(float)
        first_seen = {}
        for docs in result_sets:
            for rank, doc in enumerate(docs, 1):
                chunk_id = doc.metadata["chunk_id"]
                fused_scores[chunk_id] += 1 / (rank + self._RRF_OFFSET)
                first_seen.setdefault(chunk_id, doc)

        # sorted() is stable, so equal scores stay in first-seen order.
        return sorted(
            first_seen.values(),
            key=lambda d: fused_scores[d.metadata["chunk_id"]],
            reverse=True
        )

    def _format_context(self, docs: List[Document]) -> str:
        """Render ``docs`` as Markdown sections separated by horizontal rules."""
        context = []
        for doc in docs:
            context_str = f"""**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})
**Application**: {doc.metadata['application']}
**Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}
**Strategy Excerpt**:\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}"""
            context.append(context_str)
        return "\n\n---\n\n".join(context)
|
| 211 |
+
|
| 212 |
+
# --- Generation System ---
|
| 213 |
+
SYSTEM_PROMPT = """**Biomimicry Expert Guidelines**
1. Base answers strictly on context
2. **Bold** technical terms
3. Include reference links at the end of the response

Context: {context}"""


# The retry policy must wrap a callable that actually raises; a function that
# catches its own exceptions never gives tenacity anything to retry.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=20),
    reraise=True,
)
def _generate_with_retry(prompt: str) -> Optional[str]:
    """Send ``prompt`` to the generation model; exceptions propagate for retry."""
    response = gemini_client.models.generate_content(  # Use Gemini client for generation
        model=generation_model,
        contents=prompt
    )
    return response.text


def get_ai_response(query: str, context: str) -> str:
    """Generate an answer to ``query`` grounded in ``context``.

    The model call is attempted up to three times with exponential back-off.

    Returns:
        The post-processed model answer, or a fixed apology message if every
        attempt failed, the model returned no text, or post-processing failed.
    """
    prompt = f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
    try:
        text = _generate_with_retry(prompt)
        if not text:
            logger.error("Generation Error: model returned no text")
            return "I'm unable to generate a response right now. Please try again later."
        logger.info(f"Raw Response: {text}")  # Log raw response
        return _postprocess_response(text)
    except Exception as e:
        logger.error(f"Generation Error: {str(e)}")
        return "I'm unable to generate a response right now. Please try again later."
|
| 232 |
+
|
| 233 |
+
def _postprocess_response(response: str) -> str:
|
| 234 |
+
response = re.sub(r"\[(.*?)\]", r"[\1](#)", response)
|
| 235 |
+
response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
|
| 236 |
+
return response
|
| 237 |
+
|
| 238 |
+
# --- Optimized Pipeline ---
|
| 239 |
+
# Build (or load from cache) the chunked corpus and the retriever once, at import.
documents = load_and_chunk_data(data_file_name)
retriever = EnhancedRetriever(documents)


def generate_response(question: str) -> str:
    """Answer ``question`` using retrieved context.

    Returns:
        The generated answer; a prompt to enter a question when ``question``
        is empty or whitespace; ``"No relevant information found."`` when
        retrieval yields no context; or a generic error message if anything
        raises.
    """
    # Skip retrieval and generation entirely for blank input.
    if not question or not question.strip():
        return "Please enter a question."
    try:
        context = retriever.retrieve(question)
        if not context:
            return "No relevant information found."
        return get_ai_response(question, context)
    except Exception as e:
        logger.error(f"Pipeline Error: {str(e)}")
        return "An error occurred processing your request."
|
| 249 |
+
|
| 250 |
+
# --- Gradio Interface ---
|
| 251 |
+
def chat_interface(question: str, history: List[Tuple[str, str]]):
    """Gradio submit handler: answer ``question`` and extend the chat history.

    Args:
        question: Text from the input box.
        history: Existing ``(question, response)`` pairs; ``None`` is treated
            as an empty history.

    Returns:
        A tuple ``("", new_history)``: the empty string clears the input box,
        and ``new_history`` is a new list (the argument is not mutated). Blank
        questions leave the history unchanged and do not call the pipeline.
    """
    turns = list(history or [])
    if not question or not question.strip():
        return "", turns
    response = generate_response(question)
    return "", turns + [(question, response)]
|
| 254 |
+
|
| 255 |
+
# NOTE(review): the nesting below is reconstructed from syntax because the
# source carried no indentation; confirm that the button belongs in the
# same row as the textbox.
with gr.Blocks(title="AskNature BioRAG Expert", theme=gr.themes.Soft()) as demo:
    # Page heading.
    gr.Markdown("# 🌿 AskNature RAG-based Chatbot ")

    # Conversation transcript.
    with gr.Row():
        chatbot = gr.Chatbot(label="Dialogue History", height=500)

    # Question input and the button that clears the transcript.
    with gr.Row():
        question = gr.Textbox(
            placeholder="Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')",
            label="Inquiry",
            scale=4,
        )
        clear_btn = gr.Button("Clear History", variant="secondary")

    # Footer with attribution link.
    gr.Markdown("""
<div style="text-align: center; color: #4a7c59;">
<small>Powered by AskNature's Database |
Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small>
</div>""")

    # Pressing Enter sends the question; the handler returns the cleared
    # textbox value and the updated transcript.
    question.submit(
        fn=chat_interface,
        inputs=[question, chatbot],
        outputs=[question, chatbot],
    )
    # Replace the transcript with an empty list.
    clear_btn.click(fn=lambda: [], inputs=None, outputs=chatbot)
|
| 271 |
+
|
| 272 |
+
if __name__ == "__main__":
|
| 273 |
+
=======
|
| 274 |
+
# Optimized RAG System with E5-Mistral Embeddings and Gemini Flash Generation
|
| 275 |
+
|
| 276 |
+
import json
|
| 277 |
+
import logging
|
| 278 |
+
import re
|
| 279 |
+
import os
|
| 280 |
+
import pickle
|
| 281 |
+
from typing import List, Tuple, Optional
|
| 282 |
+
import gradio as gr
|
| 283 |
+
from openai import OpenAI
|
| 284 |
+
from google import genai
|
| 285 |
+
from functools import lru_cache
|
| 286 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 287 |
+
from langchain_community.retrievers import BM25Retriever
|
| 288 |
+
from langchain_community.vectorstores import FAISS
|
| 289 |
+
from langchain_core.embeddings import Embeddings
|
| 290 |
+
from langchain_core.documents import Document
|
| 291 |
+
from collections import defaultdict
|
| 292 |
+
import hashlib
|
| 293 |
+
from tqdm import tqdm
|
| 294 |
+
|
| 295 |
+
from dotenv import load_dotenv
|
| 296 |
+
load_dotenv()

# Logging is configured first so that configuration problems below are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Configuration ---
FAISS_INDEX_PATH = "faiss_index"
BM25_INDEX_PATH = "bm25_index.pkl"
CACHE_VERSION = "v1"  # Increment when data format changes
# E5-Mistral embedding model, requested through the OpenAI-compatible
# endpoint configured in OPENAI_API_CONFIG below.
embedding_model = "e5-mistral-7b-instruct"
generation_model = "gemini-2.0-flash"  # Gemini generation model
data_file_name = "AskNatureNet_data_enhanced.json"
API_CONFIG = {
    "gemini_api_key": os.getenv("GEMINI_API_KEY"),  # Gemini API key for generation
}

CHUNK_SIZE = 800
OVERLAP = 200
EMBEDDING_BATCH_SIZE = 32  # Batch size for embedding API calls

# Initialize clients
OPENAI_API_CONFIG = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    "base_url": "https://chat-ai.academiccloud.de/v1",
}

# Name the missing secrets up front; otherwise the failure only shows up as a
# client-construction or request error that does not say which variable is unset.
_missing_keys = [
    name for name in ("OPENAI_API_KEY", "GEMINI_API_KEY") if not os.getenv(name)
]
if _missing_keys:
    logger.warning(
        "Missing environment variable(s): %s", ", ".join(_missing_keys)
    )

client = OpenAI(**OPENAI_API_CONFIG)
gemini_client = genai.Client(api_key=API_CONFIG["gemini_api_key"])  # Gemini client for generation
|
| 321 |
+
|
| 322 |
+
# --- Helper Functions ---
|
| 323 |
+
def get_data_hash(file_path: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The digest is used only to name and validate cache files, not for
    security. The file is read in fixed-size chunks so a large data file
    is never held in memory in full.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter(callable, sentinel) keeps reading 1 MiB blocks until EOF (b"").
        for block in iter(lambda: f.read(1 << 20), b""):
            digest.update(block)
    return digest.hexdigest()
|
| 327 |
+
|
| 328 |
+
# --- Custom Embedding Handler with Progress Tracking ---
|
| 329 |
+
class MistralEmbeddings(Embeddings):
    """Embedding adapter that batches requests to the module-level ``client``.

    Each batch is sent to ``client.embeddings.create`` with the model named by
    ``embedding_model``; a tqdm bar reports progress over the batches.
    """

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in batches of ``EMBEDDING_BATCH_SIZE``.

        Returns:
            One embedding per input text, in input order (the order of
            ``response.data`` is taken as-is).

        Raises:
            Exception: whatever the API call raised, re-raised after logging.
                Placeholder empty vectors are deliberately NOT returned: they
                would be passed on to the vector index as if they were real
                embeddings.
            ValueError: if a batch comes back with a different number of
                vectors than texts sent.
        """
        embeddings: List[List[float]] = []
        # Process in batches with progress tracking
        for start in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="Embedding Progress"):
            batch = texts[start:start + EMBEDDING_BATCH_SIZE]
            try:
                response = client.embeddings.create(
                    input=batch,
                    model=embedding_model,
                    encoding_format="float",
                )
            except Exception as e:
                logger.error(f"Embedding Error: {str(e)}")
                raise
            vectors = [item.embedding for item in response.data]
            if len(vectors) != len(batch):
                raise ValueError(
                    f"Embedding API returned {len(vectors)} vectors for {len(batch)} inputs"
                )
            embeddings.extend(vectors)
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string via ``embed_documents``."""
        return self.embed_documents([text])[0]
|
| 350 |
+
|
| 351 |
+
# --- Data Processing with Cache Validation ---
|
| 352 |
+
def _chunk_text(text: str, size: int, overlap: int) -> List[str]:
    """Split ``text`` into ``size``-character windows that overlap by ``overlap``."""
    step = size - overlap
    if step <= 0:
        # A non-positive step would either raise an opaque range() error or
        # silently produce no chunks at all.
        raise ValueError("CHUNK_SIZE must be greater than OVERLAP")
    return [text[i:i + size] for i in range(0, len(text), step)]


def load_and_chunk_data(file_path: str) -> List[Document]:
    """Load the JSON dataset and split each item's strategy text into chunks.

    Results are cached in ``documents_{CACHE_VERSION}_{md5}.pkl`` where the
    MD5 is that of ``file_path``; a matching cache file is loaded instead of
    re-chunking. Every chunk carries the item's header fields in its
    ``page_content`` and the item's source, application, concepts,
    sustainability impacts, hyperlink and a ``chunk_id`` in its metadata.

    Args:
        file_path: Path to a JSON file holding a list of item dicts.

    Returns:
        The list of chunk ``Document`` objects.

    Raises:
        OSError: if the data file cannot be read or the cache cannot be written.
        KeyError: if an item lacks one of the required fields.
        ValueError: if ``CHUNK_SIZE`` is not greater than ``OVERLAP``.
    """
    current_hash = get_data_hash(file_path)
    cache_file = f"documents_{CACHE_VERSION}_{current_hash}.pkl"

    if os.path.exists(cache_file):
        logger.info("Loading cached documents")
        try:
            # NOTE(security): pickle.load executes code embedded in the file;
            # the cache file must come from a trusted source.
            with open(cache_file, "rb") as f:
                return pickle.load(f)
        except (pickle.UnpicklingError, EOFError) as e:
            # A truncated file or a non-pickle placeholder (e.g. a Git LFS
            # pointer checked out without LFS) should trigger a rebuild, not
            # a crash at startup.
            logger.warning(f"Ignoring unreadable document cache {cache_file}: {str(e)}")

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    documents = []
    for item in tqdm(data, desc="Chunking Progress"):
        base_content = f"""Source: {item['Source']}
Application: {item['Application']}
Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}
Technical Concepts: {', '.join(item['technical_concepts'])}
Biological Mechanisms: {', '.join(item['biological_mechanisms'])}"""

        for chunk in _chunk_text(item['Strategy'], CHUNK_SIZE, OVERLAP):
            documents.append(Document(
                page_content=f"{base_content}\nStrategy Excerpt:\n{chunk}",
                metadata={
                    "source": item["Source"],
                    "application": item["Application"],
                    "technical_concepts": item["technical_concepts"],
                    "sustainability_impacts": item["sustainability_impacts"],
                    "hyperlink": item["Hyperlink"],
                    # Running counter across the whole corpus keeps ids unique.
                    "chunk_id": f"{item['Source']}-{len(documents)+1}"
                }
            ))

    # Write to a temporary name and rename, so an interrupted run never
    # leaves a half-written cache under the final name.
    tmp_file = f"{cache_file}.tmp"
    with open(tmp_file, "wb") as f:
        pickle.dump(documents, f)
    os.replace(tmp_file, cache_file)
    return documents
|
| 391 |
+
|
| 392 |
+
# --- Optimized Retrieval System ---
|
| 393 |
+
class EnhancedRetriever:
    """Hybrid retriever: BM25 and FAISS results fused by reciprocal rank.

    Both indexes are cached on disk under names derived from the MD5 of
    ``data_file_name``. Query results are memoised per instance with
    ``lru_cache``; because the cached callables raise on failure, error
    fallbacks are never stored and a transient API error does not stick to a
    query.
    """

    # Offset added to the rank in reciprocal-rank fusion: 1 / (rank + offset).
    _RRF_OFFSET = 60
    # Maximum number of distinct queries remembered by each per-instance cache.
    _QUERY_CACHE_SIZE = 500

    def __init__(self, documents: List[Document]):
        """Build or load both indexes for ``documents`` and set up query caches."""
        self.documents = documents
        self.bm25 = self._init_bm25()
        self.vector_store = self._init_faiss()
        self.vector_retriever = self.vector_store.as_retriever(search_kwargs={"k": 3})
        # Caches are created per instance (not by decorating the methods) so
        # they are released together with the instance.
        self._cached_retrieve = lru_cache(maxsize=self._QUERY_CACHE_SIZE)(self._retrieve_uncached)
        self._cached_hyde = lru_cache(maxsize=self._QUERY_CACHE_SIZE)(self._hyde_uncached)

    def _init_bm25(self) -> BM25Retriever:
        """Load the pickled BM25 retriever if present, otherwise build and cache it."""
        cache_key = f"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}"
        if os.path.exists(cache_key):
            logger.info("Loading cached BM25 index")
            try:
                # NOTE(security): pickle.load executes code embedded in the
                # file; the cache file must come from a trusted source.
                with open(cache_key, "rb") as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError) as e:
                # Truncated file or non-pickle placeholder: rebuild instead.
                logger.warning(f"Ignoring unreadable BM25 cache {cache_key}: {str(e)}")

        logger.info("Building new BM25 index")
        retriever = BM25Retriever.from_documents(self.documents)
        retriever.k = 5
        # Temp file + rename so an interrupted write cannot leave a broken cache.
        tmp_key = f"{cache_key}.tmp"
        with open(tmp_key, "wb") as f:
            pickle.dump(retriever, f)
        os.replace(tmp_key, cache_key)
        return retriever

    def _init_faiss(self) -> FAISS:
        """Load the saved FAISS store if present, otherwise embed, build and save it."""
        cache_key = f"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}"
        if os.path.exists(cache_key):
            logger.info("Loading cached FAISS index")
            # NOTE(security): allow_dangerous_deserialization=True is explicitly
            # requested here, so the saved index must be trusted.
            return FAISS.load_local(
                cache_key,
                MistralEmbeddings(),
                allow_dangerous_deserialization=True
            )

        logger.info("Building new FAISS index")
        vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())
        vector_store.save_local(cache_key)
        return vector_store

    def retrieve(self, query: str) -> str:
        """Return formatted context for ``query``, or ``""`` if retrieval fails.

        Successful results are cached per query string; failures are logged
        and not cached.
        """
        try:
            return self._cached_retrieve(query)
        except Exception as e:
            logger.error(f"Retrieval Error: {str(e)}")
            return ""

    def _retrieve_uncached(self, query: str) -> str:
        """Run BM25, vector and HyDE-expanded BM25 searches and fuse the top 5."""
        processed_query = self._preprocess_query(query)
        expanded_query = self._hyde_expansion(processed_query)

        bm25_results = self.bm25.invoke(processed_query)
        vector_results = self.vector_retriever.invoke(processed_query)
        expanded_results = self.bm25.invoke(expanded_query)

        fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])
        return self._format_context(fused_results[:5])

    def _preprocess_query(self, query: str) -> str:
        """Lower-case the query and strip surrounding whitespace."""
        return query.lower().strip()

    def _hyde_expansion(self, query: str) -> str:
        """Return a generated draft for ``query``; fall back to ``query`` on error."""
        try:
            return self._cached_hyde(query)
        except Exception as e:
            logger.error(f"HyDE Error: {str(e)}")
            return query

    def _hyde_uncached(self, query: str) -> str:
        """Ask the generation model for a draft; raise if it returns no text."""
        response = gemini_client.models.generate_content(  # Use Gemini client for HyDE
            model=generation_model,
            contents=f"Generate a technical draft about biomimicry for: {query}\nInclude domain-specific terms."
        )
        if not response.text:
            # Raising keeps the empty result out of the cache and lets the
            # caller fall back to the plain query.
            raise ValueError("HyDE generation returned no text")
        return response.text

    def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:
        """Merge ranked lists by reciprocal-rank fusion, one entry per ``chunk_id``.

        Each appearance of a chunk at 1-based ``rank`` adds
        ``1 / (rank + _RRF_OFFSET)`` to its score. The first Document seen for
        a chunk id is the one returned; ties keep first-seen order.
        """
        fused_scores = defaultdict(float)
        first_seen = {}
        for docs in result_sets:
            for rank, doc in enumerate(docs, 1):
                chunk_id = doc.metadata["chunk_id"]
                fused_scores[chunk_id] += 1 / (rank + self._RRF_OFFSET)
                first_seen.setdefault(chunk_id, doc)

        # sorted() is stable, so equal scores stay in first-seen order.
        return sorted(
            first_seen.values(),
            key=lambda d: fused_scores[d.metadata["chunk_id"]],
            reverse=True
        )

    def _format_context(self, docs: List[Document]) -> str:
        """Render ``docs`` as Markdown sections separated by horizontal rules."""
        context = []
        for doc in docs:
            context_str = f"""**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})
**Application**: {doc.metadata['application']}
**Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}
**Strategy Excerpt**:\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}"""
            context.append(context_str)
        return "\n\n---\n\n".join(context)
|
| 485 |
+
|
| 486 |
+
# --- Generation System ---
|
| 487 |
+
SYSTEM_PROMPT = """**Biomimicry Expert Guidelines**
1. Base answers strictly on context
2. **Bold** technical terms
3. Include reference links at the end of the response

Context: {context}"""


# The retry policy must wrap a callable that actually raises; a function that
# catches its own exceptions never gives tenacity anything to retry.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=20),
    reraise=True,
)
def _generate_with_retry(prompt: str) -> Optional[str]:
    """Send ``prompt`` to the generation model; exceptions propagate for retry."""
    response = gemini_client.models.generate_content(  # Use Gemini client for generation
        model=generation_model,
        contents=prompt
    )
    return response.text


def get_ai_response(query: str, context: str) -> str:
    """Generate an answer to ``query`` grounded in ``context``.

    The model call is attempted up to three times with exponential back-off.

    Returns:
        The post-processed model answer, or a fixed apology message if every
        attempt failed, the model returned no text, or post-processing failed.
    """
    prompt = f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
    try:
        text = _generate_with_retry(prompt)
        if not text:
            logger.error("Generation Error: model returned no text")
            return "I'm unable to generate a response right now. Please try again later."
        logger.info(f"Raw Response: {text}")  # Log raw response
        return _postprocess_response(text)
    except Exception as e:
        logger.error(f"Generation Error: {str(e)}")
        return "I'm unable to generate a response right now. Please try again later."
|
| 506 |
+
|
| 507 |
+
def _postprocess_response(response: str) -> str:
|
| 508 |
+
response = re.sub(r"\[(.*?)\]", r"[\1](#)", response)
|
| 509 |
+
response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
|
| 510 |
+
return response
|
| 511 |
+
|
| 512 |
+
# --- Optimized Pipeline ---
|
| 513 |
+
# Build (or load from cache) the chunked corpus and the retriever once, at import.
documents = load_and_chunk_data(data_file_name)
retriever = EnhancedRetriever(documents)


def generate_response(question: str) -> str:
    """Answer ``question`` using retrieved context.

    Returns:
        The generated answer; a prompt to enter a question when ``question``
        is empty or whitespace; ``"No relevant information found."`` when
        retrieval yields no context; or a generic error message if anything
        raises.
    """
    # Skip retrieval and generation entirely for blank input.
    if not question or not question.strip():
        return "Please enter a question."
    try:
        context = retriever.retrieve(question)
        if not context:
            return "No relevant information found."
        return get_ai_response(question, context)
    except Exception as e:
        logger.error(f"Pipeline Error: {str(e)}")
        return "An error occurred processing your request."
|
| 523 |
+
|
| 524 |
+
# --- Gradio Interface ---
|
| 525 |
+
def chat_interface(question: str, history: List[Tuple[str, str]]):
    """Gradio submit handler: answer ``question`` and extend the chat history.

    Args:
        question: Text from the input box.
        history: Existing ``(question, response)`` pairs; ``None`` is treated
            as an empty history.

    Returns:
        A tuple ``("", new_history)``: the empty string clears the input box,
        and ``new_history`` is a new list (the argument is not mutated). Blank
        questions leave the history unchanged and do not call the pipeline.
    """
    turns = list(history or [])
    if not question or not question.strip():
        return "", turns
    response = generate_response(question)
    return "", turns + [(question, response)]
|
| 528 |
+
|
| 529 |
+
# NOTE(review): the nesting below is reconstructed from syntax because the
# source carried no indentation; confirm that the button belongs in the
# same row as the textbox.
with gr.Blocks(title="AskNature BioRAG Expert", theme=gr.themes.Soft()) as demo:
    # Page heading.
    gr.Markdown("# 🌿 AskNature RAG-based Chatbot ")

    # Conversation transcript.
    with gr.Row():
        chatbot = gr.Chatbot(label="Dialogue History", height=500)

    # Question input and the button that clears the transcript.
    with gr.Row():
        question = gr.Textbox(
            placeholder="Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')",
            label="Inquiry",
            scale=4,
        )
        clear_btn = gr.Button("Clear History", variant="secondary")

    # Footer with attribution link.
    gr.Markdown("""
<div style="text-align: center; color: #4a7c59;">
<small>Powered by AskNature's Database |
Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small>
</div>""")

    # Pressing Enter sends the question; the handler returns the cleared
    # textbox value and the updated transcript.
    question.submit(
        fn=chat_interface,
        inputs=[question, chatbot],
        outputs=[question, chatbot],
    )
    # Replace the transcript with an empty list.
    clear_btn.click(fn=lambda: [], inputs=None, outputs=chatbot)
|
| 545 |
+
|
| 546 |
+
if __name__ == "__main__":
|
| 547 |
+
>>>>>>> 51466f9c2c65701d4b45dd8e842e1a151f75959b
|
| 548 |
+
demo.launch(show_error=True)
|
bm25_index.pkl_5c0c37d3cbc20e235eeec7cffd2d312f
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61d29d4cd2651f2f356e67f24dafbb804293116be434bef7ec4f43b2f5afa456
|
| 3 |
+
size 13737932
|
documents_v1_5c0c37d3cbc20e235eeec7cffd2d312f.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:abddd30a2c4716bc6b06e7db60860017cd80838633bfa437dfa16f8d0e322817
|
| 3 |
+
size 6358288
|
main.ipynb
ADDED
|
@@ -0,0 +1,1250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"# Approach 1: Local Llama2 via Ollama\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"questions = [\n",
|
| 12 |
+
" \"How do coral proteins help make eco-friendly fabrics without dyes?\",\n",
|
| 13 |
+
" \"What environmental problems do coral-inspired textiles solve?\",\n",
|
| 14 |
+
" \"What is industrial symbiosis and how does the Kalundborg example work?\",\n",
|
| 15 |
+
" \"How do Metavision sensors work like human eyes to save energy?\",\n",
|
| 16 |
+
" \"How does TISSIUM copy skin proteins for medical adhesives?\",\n",
|
| 17 |
+
" \"How does DNA-level design create better fibers inspired by nature?\",\n",
|
| 18 |
+
" \"Why is industrial symbiosis hard to implement despite benefits?\",\n",
|
| 19 |
+
" \"How can biological systems inspire sustainable manufacturing?\",\n",
|
| 20 |
+
" \"What other industries can use protein-based materials like Werewool?\",\n",
|
| 21 |
+
" \"How could event-based cameras improve security systems?\",\n",
|
| 22 |
+
" \"Design a factory network that works like coral reef partnerships - what features would it need?\"\n",
|
| 23 |
+
"]\n",
|
| 24 |
+
"\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"import json\n",
|
| 27 |
+
"import pandas as pd\n",
|
| 28 |
+
"from langchain_ollama import OllamaLLM, OllamaEmbeddings\n",
|
| 29 |
+
"from langchain_community.vectorstores import FAISS\n",
|
| 30 |
+
"from langchain_core.prompts import PromptTemplate\n",
|
| 31 |
+
"from langchain_core.output_parsers import StrOutputParser\n",
|
| 32 |
+
"from operator import itemgetter\n",
|
| 33 |
+
"import gradio as gr\n",
|
| 34 |
+
"\n",
|
| 35 |
+
"# Load and process data\n",
|
| 36 |
+
"with open('mini_data.json', 'r', encoding='utf-8') as f:\n",
|
| 37 |
+
" data = json.load(f)\n",
|
| 38 |
+
"documents = [f\"Source: {item['Source']}\\nApplication: {item['Application']}\\nFunction1: {item['Function1']}\\nStrategy: {item['Strategy']}\" for item in data]\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"# Local Llama2 setup\n",
|
| 41 |
+
"local_model = OllamaLLM(model=\"llama2\")\n",
|
| 42 |
+
"local_embeddings = OllamaEmbeddings(model=\"llama2\")\n",
|
| 43 |
+
"vectorstore = FAISS.from_texts(documents, local_embeddings)\n",
|
| 44 |
+
"retriever = vectorstore.as_retriever()\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"# RAG pipeline\n",
|
| 47 |
+
"template = \"\"\"Answer the question based on the context below. If unsure, reply \"I don't know\".\n",
|
| 48 |
+
"Context: {context}\n",
|
| 49 |
+
"Question: {question}\"\"\"\n",
|
| 50 |
+
"prompt = PromptTemplate.from_template(template)\n",
|
| 51 |
+
"local_chain = ({\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")} \n",
|
| 52 |
+
" | prompt | local_model | StrOutputParser())\n",
|
| 53 |
+
"\n",
|
| 54 |
+
"# Chat interface\n",
|
| 55 |
+
"def local_rag(question, history):\n",
|
| 56 |
+
" response = local_chain.invoke({\"question\": question})\n",
|
| 57 |
+
" history.append((question, response))\n",
|
| 58 |
+
" return \"\", history\n",
|
| 59 |
+
"\n",
|
| 60 |
+
"with gr.Blocks() as local_demo:\n",
|
| 61 |
+
" gr.Markdown(\"# Local Llama2 RAG Chatbot\")\n",
|
| 62 |
+
" chatbot = gr.Chatbot()\n",
|
| 63 |
+
" question = gr.Textbox(label=\"Ask about biomimicry:\")\n",
|
| 64 |
+
" question.submit(local_rag, [question, chatbot], [question, chatbot])\n",
|
| 65 |
+
" \n",
|
| 66 |
+
"local_demo.launch()"
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"cell_type": "code",
|
| 71 |
+
"execution_count": null,
|
| 72 |
+
"metadata": {},
|
| 73 |
+
"outputs": [],
|
| 74 |
+
"source": [
|
| 75 |
+
"# Approach 2: Llama3.3 via API\n",
|
| 76 |
+
"import json\n",
|
| 77 |
+
"import gradio as gr\n",
|
| 78 |
+
"from openai import OpenAI\n",
|
| 79 |
+
"from operator import itemgetter\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"# API configuration\n",
|
| 82 |
+
"api_key = 'd9960fad1d2aaa16167902b0d26e369f'\n",
|
| 83 |
+
"base_url = \"https://chat-ai.academiccloud.de/v1\"\n",
|
| 84 |
+
"model = \"llama-3.3-70b-instruct\"\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"# Initialize OpenAI client\n",
|
| 87 |
+
"client = OpenAI(api_key=api_key, base_url=base_url)\n",
|
| 88 |
+
"\n",
|
| 89 |
+
"# Load and process data\n",
|
| 90 |
+
"with open('mini_data.json', 'r', encoding='utf-8') as f:\n",
|
| 91 |
+
" data = json.load(f)\n",
|
| 92 |
+
"documents = [f\"Source: {item['Source']}\\nApplication: {item['Application']}\\nFunction1: {item['Function1']}\\nStrategy: {item['Strategy']}\" for item in data]\n",
|
| 93 |
+
"\n",
|
| 94 |
+
"def retrieve_context(question):\n",
|
| 95 |
+
" \"\"\"Simple keyword-based retrieval since embeddings aren't available\"\"\"\n",
|
| 96 |
+
" keywords = set(question.lower().split())\n",
|
| 97 |
+
" relevant = []\n",
|
| 98 |
+
" for doc in documents:\n",
|
| 99 |
+
" if any(keyword in doc.lower() for keyword in keywords):\n",
|
| 100 |
+
" relevant.append(doc)\n",
|
| 101 |
+
" return \"\\n\\n\".join(relevant[:3]) # Return top 3 matches\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"def generate_response(question):\n",
|
| 104 |
+
" context = retrieve_context(question)\n",
|
| 105 |
+
" response = client.chat.completions.create(\n",
|
| 106 |
+
" messages=[\n",
|
| 107 |
+
" {\"role\": \"system\", \"content\": f\"Answer based on context. If unsure, say 'I don't know'.\\nContext: {context}\"},\n",
|
| 108 |
+
" {\"role\": \"user\", \"content\": question}\n",
|
| 109 |
+
" ],\n",
|
| 110 |
+
" model=model\n",
|
| 111 |
+
" )\n",
|
| 112 |
+
" return response.choices[0].message.content\n",
|
| 113 |
+
"\n",
|
| 114 |
+
"# Chat interface\n",
|
| 115 |
+
"def cloud_rag(question, history):\n",
|
| 116 |
+
" response = generate_response(question)\n",
|
| 117 |
+
" history.append((question, response))\n",
|
| 118 |
+
" return \"\", history\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"with gr.Blocks() as demo:\n",
|
| 121 |
+
" gr.Markdown(\"# AskNature RAG-based Chatbot\")\n",
|
| 122 |
+
" chatbot = gr.Chatbot()\n",
|
| 123 |
+
" question = gr.Textbox(label=\"Ask about biomimicry:\")\n",
|
| 124 |
+
" question.submit(cloud_rag, [question, chatbot], [question, chatbot])\n",
|
| 125 |
+
" \n",
|
| 126 |
+
"demo.launch()"
|
| 127 |
+
]
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"cell_type": "code",
|
| 131 |
+
"execution_count": null,
|
| 132 |
+
"metadata": {},
|
| 133 |
+
"outputs": [],
|
| 134 |
+
"source": []
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"cell_type": "code",
|
| 138 |
+
"execution_count": null,
|
| 139 |
+
"metadata": {},
|
| 140 |
+
"outputs": [],
|
| 141 |
+
"source": [
|
| 142 |
+
"# Enhanced Metadata Generation with Rate Control and Incremental Processing\n",
|
| 143 |
+
"import json\n",
|
| 144 |
+
"import time\n",
|
| 145 |
+
"import random\n",
|
| 146 |
+
"from typing import Dict, List\n",
|
| 147 |
+
"from openai import OpenAI\n",
|
| 148 |
+
"from tenacity import retry, stop_after_attempt, wait_random_exponential\n",
|
| 149 |
+
"import os\n",
|
| 150 |
+
"\n",
|
| 151 |
+
"# Initialize OpenAI client\n",
|
| 152 |
+
"client = OpenAI(\n",
|
| 153 |
+
" api_key= 'd9960fad1d2aaa16167902b0d26e369f', # 'd1c9ed1ca70b9721dee1087d93f9662a',\n",
|
| 154 |
+
" base_url=\"https://chat-ai.academiccloud.de/v1\"\n",
|
| 155 |
+
")\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(min=2, max=60))\n",
|
| 158 |
+
"def generate_metadata_tags(strategy_text: str) -> Dict:\n",
|
| 159 |
+
" \"\"\"Generate structured metadata with enhanced error handling\"\"\"\n",
|
| 160 |
+
" system_prompt = \"\"\"Analyze the technical text and generate structured metadata:\n",
|
| 161 |
+
"1. **Technical Concepts** (array, max 5 items): Specific technical terms/methods\n",
|
| 162 |
+
"2. **Biological Mechanisms** (array, max 3): Biological processes observed in nature\n",
|
| 163 |
+
"3. **Industry Applications** (array, max 3): Practical commercial uses\n",
|
| 164 |
+
"4. **Sustainability Impacts** (array, max 2): Environmental benefits\n",
|
| 165 |
+
"\n",
|
| 166 |
+
"Example Response:\n",
|
| 167 |
+
"{\n",
|
| 168 |
+
" \"technical_concepts\": [\"protein-based pigmentation\", \"DNA-level fiber design\"],\n",
|
| 169 |
+
" \"biological_mechanisms\": [\"coral-algae symbiosis\"],\n",
|
| 170 |
+
" \"industry_applications\": [\"textile manufacturing\"],\n",
|
| 171 |
+
" \"sustainability_impacts\": [\"reduces chemical waste\"]\n",
|
| 172 |
+
"}\"\"\"\n",
|
| 173 |
+
"\n",
|
| 174 |
+
" response = client.chat.completions.create(\n",
|
| 175 |
+
" messages=[\n",
|
| 176 |
+
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
| 177 |
+
" {\"role\": \"user\", \"content\": strategy_text}\n",
|
| 178 |
+
" ],\n",
|
| 179 |
+
" model=\"llama-3.3-70b-instruct\",\n",
|
| 180 |
+
" temperature=0.1,\n",
|
| 181 |
+
" response_format={\"type\": \"json_object\"}\n",
|
| 182 |
+
" )\n",
|
| 183 |
+
" \n",
|
| 184 |
+
" return validate_metadata(json.loads(response.choices[0].message.content))\n",
|
| 185 |
+
"\n",
|
| 186 |
+
"def validate_metadata(metadata: Dict) -> Dict:\n",
|
| 187 |
+
" \"\"\"Ensure metadata structure quality\"\"\"\n",
|
| 188 |
+
" required_keys = {\n",
|
| 189 |
+
" \"technical_concepts\": list,\n",
|
| 190 |
+
" \"biological_mechanisms\": list,\n",
|
| 191 |
+
" \"industry_applications\": list,\n",
|
| 192 |
+
" \"sustainability_impacts\": list\n",
|
| 193 |
+
" }\n",
|
| 194 |
+
" \n",
|
| 195 |
+
" for key, type_ in required_keys.items():\n",
|
| 196 |
+
" if key not in metadata or not isinstance(metadata[key], type_):\n",
|
| 197 |
+
" raise ValueError(f\"Invalid metadata format for {key}\")\n",
|
| 198 |
+
" \n",
|
| 199 |
+
" return metadata\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"def enhance_dataset(input_file: str, output_file: str):\n",
|
| 202 |
+
" \"\"\"Robust incremental metadata enhancement with rate control\"\"\"\n",
|
| 203 |
+
" # Load existing enhanced data\n",
|
| 204 |
+
" existing_data = []\n",
|
| 205 |
+
" existing_hyperlinks = set()\n",
|
| 206 |
+
" \n",
|
| 207 |
+
" if os.path.exists(output_file):\n",
|
| 208 |
+
" with open(output_file, 'r') as f:\n",
|
| 209 |
+
" existing_data = json.load(f)\n",
|
| 210 |
+
" existing_hyperlinks = {item[\"Hyperlink\"] for item in existing_data if \"Hyperlink\" in item}\n",
|
| 211 |
+
" \n",
|
| 212 |
+
" # Load input data and filter unprocessed items\n",
|
| 213 |
+
" with open(input_file, 'r') as f:\n",
|
| 214 |
+
" input_data = json.load(f)\n",
|
| 215 |
+
" \n",
|
| 216 |
+
" new_items = [item for item in input_data if item.get(\"Hyperlink\") not in existing_hyperlinks]\n",
|
| 217 |
+
" \n",
|
| 218 |
+
" if not new_items:\n",
|
| 219 |
+
" print(\"All items already processed in the enhanced file.\")\n",
|
| 220 |
+
" return\n",
|
| 221 |
+
" else:\n",
|
| 222 |
+
" output_length = len(existing_data)\n",
|
| 223 |
+
" input_length = len(input_data)\n",
|
| 224 |
+
" print(f\"Processing {len(new_items)} new items... out of {input_length} total\")\n",
|
| 225 |
+
" \n",
|
| 226 |
+
" results = existing_data.copy()\n",
|
| 227 |
+
" error_count = 0\n",
|
| 228 |
+
" total_items = len(new_items)\n",
|
| 229 |
+
" \n",
|
| 230 |
+
" for idx, item in enumerate(new_items):\n",
|
| 231 |
+
" try:\n",
|
| 232 |
+
" # Enhanced rate control with progressive backoff\n",
|
| 233 |
+
" if idx > 0:\n",
|
| 234 |
+
" base_delay = min(5 + (idx // 10), 30) # Progressive delay up to 30s\n",
|
| 235 |
+
" delay = random.uniform(base_delay, base_delay + 5)\n",
|
| 236 |
+
" time.sleep(delay)\n",
|
| 237 |
+
" \n",
|
| 238 |
+
" # Process item\n",
|
| 239 |
+
" metadata = generate_metadata_tags(item[\"Strategy\"])\n",
|
| 240 |
+
" enhanced_item = {**item, **metadata}\n",
|
| 241 |
+
" results.append(enhanced_item)\n",
|
| 242 |
+
" \n",
|
| 243 |
+
" # Checkpoint saving\n",
|
| 244 |
+
" if (idx + 1) % 5 == 0 or (idx + 1) == total_items:\n",
|
| 245 |
+
" with open(output_file, 'w') as f:\n",
|
| 246 |
+
" json.dump(results, f, indent=2)\n",
|
| 247 |
+
" print(f\"Progress: {idx+1+output_length}/{input_length} items processed\")\n",
|
| 248 |
+
" \n",
|
| 249 |
+
" except Exception as e:\n",
|
| 250 |
+
" error_count += 1\n",
|
| 251 |
+
" print(f\"Error processing {item.get('Source', 'Unknown')}: {str(e)}\")\n",
|
| 252 |
+
" # results.append(item) # Preserve original data\n",
|
| 253 |
+
" \n",
|
| 254 |
+
" print(f\"Processing complete. Success rate: {total_items-error_count}/{input_length}\")\n",
|
| 255 |
+
"\n",
|
| 256 |
+
"# Execute enhancement\n",
|
| 257 |
+
"enhance_dataset(\"AskNatureNet_data.json\", \"AskNatureNet_data_enhanced.json\")"
|
| 258 |
+
]
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"cell_type": "code",
|
| 262 |
+
"execution_count": null,
|
| 263 |
+
"metadata": {},
|
| 264 |
+
"outputs": [],
|
| 265 |
+
"source": [
|
| 266 |
+
"# Optimized RAG System with E5-Mistral Embeddings and Llama3-70B Generation\n",
|
| 267 |
+
" \n",
|
| 268 |
+
"import json\n",
|
| 269 |
+
"import logging\n",
|
| 270 |
+
"import re\n",
|
| 271 |
+
"import os\n",
|
| 272 |
+
"import pickle\n",
|
| 273 |
+
"from typing import List, Tuple, Optional\n",
|
| 274 |
+
"import gradio as gr\n",
|
| 275 |
+
"from openai import OpenAI\n",
|
| 276 |
+
"from functools import lru_cache\n",
|
| 277 |
+
"from tenacity import retry, stop_after_attempt, wait_exponential\n",
|
| 278 |
+
"from langchain_community.retrievers import BM25Retriever\n",
|
| 279 |
+
"from langchain_community.vectorstores import FAISS\n",
|
| 280 |
+
"from langchain_core.embeddings import Embeddings\n",
|
| 281 |
+
"from langchain_core.documents import Document\n",
|
| 282 |
+
"from collections import defaultdict\n",
|
| 283 |
+
"import hashlib\n",
|
| 284 |
+
"from tqdm import tqdm # For progress tracking\n",
|
| 285 |
+
"from dotenv import load_dotenv\n",
|
| 286 |
+
"load_dotenv()\n",
|
| 287 |
+
"\n",
|
| 288 |
+
"# --- Configuration ---\n",
|
| 289 |
+
"FAISS_INDEX_PATH = \"faiss_index\"\n",
|
| 290 |
+
"BM25_INDEX_PATH = \"bm25_index.pkl\"\n",
|
| 291 |
+
"CACHE_VERSION = \"v1\" # Increment when data format changes\n",
|
| 292 |
+
"embedding_model = \"e5-mistral-7b-instruct\"\n",
|
| 293 |
+
"generation_model = \"meta-llama-3-70b-instruct\"\n",
|
| 294 |
+
"data_file_name = \"AskNatureNet_data_enhanced.json\"\n",
|
| 295 |
+
"API_CONFIG = {\n",
|
| 296 |
+
" \"api_key\": os.getenv(\"OPENAI_API_KEY\"),\n",
|
| 297 |
+
" \"base_url\": \"https://chat-ai.academiccloud.de/v1\"\n",
|
| 298 |
+
"}\n",
|
| 299 |
+
"CHUNK_SIZE = 800\n",
|
| 300 |
+
"OVERLAP = 200\n",
|
| 301 |
+
"EMBEDDING_BATCH_SIZE = 32 # Batch size for embedding API calls\n",
|
| 302 |
+
"\n",
|
| 303 |
+
"# Initialize clients\n",
|
| 304 |
+
"client = OpenAI(**API_CONFIG)\n",
|
| 305 |
+
"logging.basicConfig(level=logging.INFO)\n",
|
| 306 |
+
"logger = logging.getLogger(__name__)\n",
|
| 307 |
+
"\n",
|
| 308 |
+
"# --- Helper Functions ---\n",
|
| 309 |
+
"def get_data_hash(file_path: str) -> str:\n",
|
| 310 |
+
" \"\"\"Generate hash of data file for cache validation\"\"\"\n",
|
| 311 |
+
" with open(file_path, \"rb\") as f:\n",
|
| 312 |
+
" return hashlib.md5(f.read()).hexdigest()\n",
|
| 313 |
+
"\n",
|
| 314 |
+
"# --- Custom Embedding Handler with Progress Tracking ---\n",
|
| 315 |
+
"class MistralEmbeddings(Embeddings):\n",
|
| 316 |
+
" \"\"\"E5-Mistral-7B embedding adapter with error handling and progress tracking\"\"\"\n",
|
| 317 |
+
" def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
|
| 318 |
+
" embeddings = []\n",
|
| 319 |
+
" try:\n",
|
| 320 |
+
" # Process in batches with progress tracking\n",
|
| 321 |
+
" for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc=\"Embedding Progress\"):\n",
|
| 322 |
+
" batch = texts[i:i + EMBEDDING_BATCH_SIZE]\n",
|
| 323 |
+
" response = client.embeddings.create(\n",
|
| 324 |
+
" input=batch,\n",
|
| 325 |
+
" model=embedding_model,\n",
|
| 326 |
+
" encoding_format=\"float\"\n",
|
| 327 |
+
" )\n",
|
| 328 |
+
" embeddings.extend([e.embedding for e in response.data])\n",
|
| 329 |
+
" return embeddings\n",
|
| 330 |
+
" except Exception as e:\n",
|
| 331 |
+
" logger.error(f\"Embedding Error: {str(e)}\")\n",
|
| 332 |
+
" return [[] for _ in texts]\n",
|
| 333 |
+
"\n",
|
| 334 |
+
" def embed_query(self, text: str) -> List[float]:\n",
|
| 335 |
+
" return self.embed_documents([text])[0]\n",
|
| 336 |
+
"\n",
|
| 337 |
+
"# --- Data Processing with Cache Validation ---\n",
|
| 338 |
+
"def load_and_chunk_data(file_path: str) -> List[Document]:\n",
|
| 339 |
+
" \"\"\"Enhanced chunking with metadata preservation\"\"\"\n",
|
| 340 |
+
" current_hash = get_data_hash(file_path)\n",
|
| 341 |
+
" cache_file = f\"documents_{CACHE_VERSION}_{current_hash}.pkl\"\n",
|
| 342 |
+
" \n",
|
| 343 |
+
" if os.path.exists(cache_file):\n",
|
| 344 |
+
" logger.info(\"Loading cached documents\")\n",
|
| 345 |
+
" with open(cache_file, \"rb\") as f:\n",
|
| 346 |
+
" return pickle.load(f)\n",
|
| 347 |
+
" \n",
|
| 348 |
+
" with open(file_path, 'r', encoding='utf-8') as f:\n",
|
| 349 |
+
" data = json.load(f)\n",
|
| 350 |
+
" \n",
|
| 351 |
+
" documents = []\n",
|
| 352 |
+
" for item in tqdm(data, desc=\"Chunking Progress\"):\n",
|
| 353 |
+
" base_content = f\"\"\"Source: {item['Source']}\n",
|
| 354 |
+
"Application: {item['Application']}\n",
|
| 355 |
+
"Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}\n",
|
| 356 |
+
"Technical Concepts: {', '.join(item['technical_concepts'])}\n",
|
| 357 |
+
"Biological Mechanisms: {', '.join(item['biological_mechanisms'])}\"\"\"\n",
|
| 358 |
+
" \n",
|
| 359 |
+
" strategy = item['Strategy']\n",
|
| 360 |
+
" for i in range(0, len(strategy), CHUNK_SIZE - OVERLAP):\n",
|
| 361 |
+
" chunk = strategy[i:i + CHUNK_SIZE]\n",
|
| 362 |
+
" documents.append(Document(\n",
|
| 363 |
+
" page_content=f\"{base_content}\\nStrategy Excerpt:\\n{chunk}\",\n",
|
| 364 |
+
" metadata={\n",
|
| 365 |
+
" \"source\": item[\"Source\"],\n",
|
| 366 |
+
" \"application\": item[\"Application\"],\n",
|
| 367 |
+
" \"technical_concepts\": item[\"technical_concepts\"],\n",
|
| 368 |
+
" \"sustainability_impacts\": item[\"sustainability_impacts\"],\n",
|
| 369 |
+
" \"hyperlink\": item[\"Hyperlink\"],\n",
|
| 370 |
+
" \"chunk_id\": f\"{item['Source']}-{len(documents)+1}\"\n",
|
| 371 |
+
" }\n",
|
| 372 |
+
" ))\n",
|
| 373 |
+
" \n",
|
| 374 |
+
" with open(cache_file, \"wb\") as f:\n",
|
| 375 |
+
" pickle.dump(documents, f)\n",
|
| 376 |
+
" return documents\n",
|
| 377 |
+
"\n",
|
| 378 |
+
"# --- Optimized Retrieval System ---\n",
|
| 379 |
+
"class EnhancedRetriever:\n",
|
| 380 |
+
" \"\"\"Hybrid retriever with persistent caching\"\"\"\n",
|
| 381 |
+
" def __init__(self, documents: List[Document]):\n",
|
| 382 |
+
" self.documents = documents\n",
|
| 383 |
+
" self.bm25 = self._init_bm25()\n",
|
| 384 |
+
" self.vector_store = self._init_faiss()\n",
|
| 385 |
+
" self.vector_retriever = self.vector_store.as_retriever(search_kwargs={\"k\": 3})\n",
|
| 386 |
+
"\n",
|
| 387 |
+
" def _init_bm25(self) -> BM25Retriever:\n",
|
| 388 |
+
" cache_key = f\"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
|
| 389 |
+
" if os.path.exists(cache_key):\n",
|
| 390 |
+
" logger.info(\"Loading cached BM25 index\")\n",
|
| 391 |
+
" with open(cache_key, \"rb\") as f:\n",
|
| 392 |
+
" return pickle.load(f)\n",
|
| 393 |
+
" \n",
|
| 394 |
+
" logger.info(\"Building new BM25 index\")\n",
|
| 395 |
+
" retriever = BM25Retriever.from_documents(self.documents)\n",
|
| 396 |
+
" retriever.k = 5\n",
|
| 397 |
+
" with open(cache_key, \"wb\") as f:\n",
|
| 398 |
+
" pickle.dump(retriever, f)\n",
|
| 399 |
+
" return retriever\n",
|
| 400 |
+
"\n",
|
| 401 |
+
" def _init_faiss(self) -> FAISS:\n",
|
| 402 |
+
" cache_key = f\"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
|
| 403 |
+
" if os.path.exists(cache_key):\n",
|
| 404 |
+
" logger.info(\"Loading cached FAISS index\")\n",
|
| 405 |
+
" return FAISS.load_local(\n",
|
| 406 |
+
" cache_key,\n",
|
| 407 |
+
" MistralEmbeddings(),\n",
|
| 408 |
+
" allow_dangerous_deserialization=True\n",
|
| 409 |
+
" )\n",
|
| 410 |
+
" \n",
|
| 411 |
+
" logger.info(\"Building new FAISS index\")\n",
|
| 412 |
+
" vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())\n",
|
| 413 |
+
" vector_store.save_local(cache_key)\n",
|
| 414 |
+
" return vector_store\n",
|
| 415 |
+
"\n",
|
| 416 |
+
" @lru_cache(maxsize=500)\n",
|
| 417 |
+
" def retrieve(self, query: str) -> str:\n",
|
| 418 |
+
" try:\n",
|
| 419 |
+
" processed_query = self._preprocess_query(query)\n",
|
| 420 |
+
" expanded_query = self._hyde_expansion(processed_query)\n",
|
| 421 |
+
" \n",
|
| 422 |
+
" bm25_results = self.bm25.invoke(processed_query)\n",
|
| 423 |
+
" vector_results = self.vector_retriever.invoke(processed_query)\n",
|
| 424 |
+
" expanded_results = self.bm25.invoke(expanded_query)\n",
|
| 425 |
+
" \n",
|
| 426 |
+
" fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])\n",
|
| 427 |
+
" return self._format_context(fused_results[:5])\n",
|
| 428 |
+
" except Exception as e:\n",
|
| 429 |
+
" logger.error(f\"Retrieval Error: {str(e)}\")\n",
|
| 430 |
+
" return \"\"\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" def _preprocess_query(self, query: str) -> str:\n",
|
| 433 |
+
" return query.lower().strip()\n",
|
| 434 |
+
"\n",
|
| 435 |
+
" @lru_cache(maxsize=500)\n",
|
| 436 |
+
" def _hyde_expansion(self, query: str) -> str:\n",
|
| 437 |
+
" try:\n",
|
| 438 |
+
" response = client.chat.completions.create(\n",
|
| 439 |
+
" model=generation_model,\n",
|
| 440 |
+
" messages=[{\n",
|
| 441 |
+
" \"role\": \"user\",\n",
|
| 442 |
+
" \"content\": f\"Generate a technical draft about biomimicry for: {query}\\nInclude domain-specific terms.\"\n",
|
| 443 |
+
" }],\n",
|
| 444 |
+
" temperature=0.5,\n",
|
| 445 |
+
" max_tokens=200\n",
|
| 446 |
+
" )\n",
|
| 447 |
+
" return response.choices[0].message.content\n",
|
| 448 |
+
" except Exception as e:\n",
|
| 449 |
+
" logger.error(f\"HyDE Error: {str(e)}\")\n",
|
| 450 |
+
" return query\n",
|
| 451 |
+
"\n",
|
| 452 |
+
" def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:\n",
|
| 453 |
+
" fused_scores = defaultdict(float)\n",
|
| 454 |
+
" for docs in result_sets:\n",
|
| 455 |
+
" for rank, doc in enumerate(docs, 1):\n",
|
| 456 |
+
" fused_scores[doc.metadata[\"chunk_id\"]] += 1 / (rank + 60)\n",
|
| 457 |
+
" \n",
|
| 458 |
+
" seen = set()\n",
|
| 459 |
+
" return [\n",
|
| 460 |
+
" doc for doc in sorted(\n",
|
| 461 |
+
" (doc for docs in result_sets for doc in docs),\n",
|
| 462 |
+
" key=lambda x: fused_scores[x.metadata[\"chunk_id\"]],\n",
|
| 463 |
+
" reverse=True\n",
|
| 464 |
+
" ) if not (doc.metadata[\"chunk_id\"] in seen or seen.add(doc.metadata[\"chunk_id\"]))\n",
|
| 465 |
+
" ]\n",
|
| 466 |
+
"\n",
|
| 467 |
+
" def _format_context(self, docs: List[Document]) -> str:\n",
|
| 468 |
+
" context = []\n",
|
| 469 |
+
" for doc in docs:\n",
|
| 470 |
+
" context_str = f\"\"\"**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})\n",
|
| 471 |
+
" **Application**: {doc.metadata['application']}\n",
|
| 472 |
+
" **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}\n",
|
| 473 |
+
" **Strategy Excerpt**:\\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}\"\"\"\n",
|
| 474 |
+
" context.append(context_str)\n",
|
| 475 |
+
" return \"\\n\\n---\\n\\n\".join(context)\n",
|
| 476 |
+
"\n",
|
| 477 |
+
"# --- Generation System ---\n",
|
| 478 |
+
"SYSTEM_PROMPT = \"\"\"**Biomimicry Expert Guidelines**\n",
|
| 479 |
+
"1. Base answers strictly on context\n",
|
| 480 |
+
"2. Cite sources as [Source]\n",
|
| 481 |
+
"3. **Bold** technical terms\n",
|
| 482 |
+
"4. Include reference links\n",
|
| 483 |
+
"\n",
|
| 484 |
+
"Context: {context}\"\"\"\n",
|
| 485 |
+
"\n",
|
| 486 |
+
"@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))\n",
|
| 487 |
+
"def get_ai_response(query: str, context: str) -> str:\n",
|
| 488 |
+
" try:\n",
|
| 489 |
+
" response = client.chat.completions.create(\n",
|
| 490 |
+
" model=generation_model,\n",
|
| 491 |
+
" messages=[\n",
|
| 492 |
+
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT.format(context=context)},\n",
|
| 493 |
+
" {\"role\": \"user\", \"content\": f\"Question: {query}\\nProvide a detailed technical answer:\"}\n",
|
| 494 |
+
" ],\n",
|
| 495 |
+
" temperature=0.4,\n",
|
| 496 |
+
"            max_tokens=2000  # Upper bound on the length of the generated answer, in tokens\n",
|
| 497 |
+
" )\n",
|
| 498 |
+
" logger.info(f\"Raw Response: {response.choices[0].message.content}\") # Log raw response\n",
|
| 499 |
+
" return _postprocess_response(response.choices[0].message.content)\n",
|
| 500 |
+
" except Exception as e:\n",
|
| 501 |
+
" logger.error(f\"Generation Error: {str(e)}\")\n",
|
| 502 |
+
" return \"I'm unable to generate a response right now. Please try again later.\"\n",
|
| 503 |
+
"\n",
|
| 504 |
+
"def _postprocess_response(response: str) -> str:\n",
|
| 505 |
+
" response = re.sub(r\"\\[(.*?)\\]\", r\"[\\1](#)\", response)\n",
|
| 506 |
+
" response = re.sub(r\"\\*\\*([\\w-]+)\\*\\*\", r\"**\\1**\", response)\n",
|
| 507 |
+
" return response\n",
|
| 508 |
+
"\n",
|
| 509 |
+
"# --- Optimized Pipeline ---\n",
|
| 510 |
+
"documents = load_and_chunk_data(data_file_name)\n",
|
| 511 |
+
"retriever = EnhancedRetriever(documents)\n",
|
| 512 |
+
"\n",
|
| 513 |
+
"def generate_response(question: str) -> str:\n",
|
| 514 |
+
" try:\n",
|
| 515 |
+
" context = retriever.retrieve(question)\n",
|
| 516 |
+
" return get_ai_response(question, context) if context else \"No relevant information found.\"\n",
|
| 517 |
+
" except Exception as e:\n",
|
| 518 |
+
" logger.error(f\"Pipeline Error: {str(e)}\")\n",
|
| 519 |
+
" return \"An error occurred processing your request.\"\n",
|
| 520 |
+
"\n",
|
| 521 |
+
"# --- Gradio Interface ---\n",
|
| 522 |
+
"def chat_interface(question: str, history: List[Tuple[str, str]]):\n",
|
| 523 |
+
" response = generate_response(question)\n",
|
| 524 |
+
" return \"\", history + [(question, response)]\n",
|
| 525 |
+
"\n",
|
| 526 |
+
"with gr.Blocks(title=\"AskNature BioRAG Expert\", theme=gr.themes.Soft()) as demo:\n",
|
| 527 |
+
" gr.Markdown(\"# 🌿 AskNature RAG-based Chatbot \")\n",
|
| 528 |
+
" with gr.Row():\n",
|
| 529 |
+
" chatbot = gr.Chatbot(label=\"Dialogue History\", height=500)\n",
|
| 530 |
+
" with gr.Row():\n",
|
| 531 |
+
" question = gr.Textbox(placeholder=\"Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')\",\n",
|
| 532 |
+
" label=\"Inquiry\", scale=4)\n",
|
| 533 |
+
" clear_btn = gr.Button(\"Clear History\", variant=\"secondary\")\n",
|
| 534 |
+
" \n",
|
| 535 |
+
" gr.Markdown(\"\"\"\n",
|
| 536 |
+
" <div style=\"text-align: center; color: #4a7c59;\">\n",
|
| 537 |
+
" <small>Powered by AskNature's Database | \n",
|
| 538 |
+
" Explore nature's blueprints at <a href=\"https://asknature.org\">asknature.org</a></small>\n",
|
| 539 |
+
" </div>\"\"\")\n",
|
| 540 |
+
" question.submit(chat_interface, [question, chatbot], [question, chatbot])\n",
|
| 541 |
+
" clear_btn.click(lambda: [], None, chatbot)\n",
|
| 542 |
+
"\n",
|
| 543 |
+
"if __name__ == \"__main__\":\n",
|
| 544 |
+
" demo.launch(show_error=True)"
|
| 545 |
+
]
|
| 546 |
+
},
|
| 547 |
+
{
|
| 548 |
+
"cell_type": "code",
|
| 549 |
+
"execution_count": null,
|
| 550 |
+
"metadata": {},
|
| 551 |
+
"outputs": [],
|
| 552 |
+
"source": [
|
| 553 |
+
"from dotenv import load_dotenv\n",
|
| 554 |
+
"import os\n",
|
| 555 |
+
"load_dotenv()\n",
|
| 556 |
+
"print(os.getenv(\"API_KEY\"))"
|
| 557 |
+
]
|
| 558 |
+
},
|
| 559 |
+
{
|
| 560 |
+
"cell_type": "code",
|
| 561 |
+
"execution_count": null,
|
| 562 |
+
"metadata": {},
|
| 563 |
+
"outputs": [],
|
| 564 |
+
"source": []
|
| 565 |
+
},
|
| 566 |
+
{
|
| 567 |
+
"cell_type": "code",
|
| 568 |
+
"execution_count": null,
|
| 569 |
+
"metadata": {},
|
| 570 |
+
"outputs": [],
|
| 571 |
+
"source": []
|
| 572 |
+
},
|
| 573 |
+
{
|
| 574 |
+
"cell_type": "code",
|
| 575 |
+
"execution_count": null,
|
| 576 |
+
"metadata": {},
|
| 577 |
+
"outputs": [],
|
| 578 |
+
"source": []
|
| 579 |
+
},
|
| 580 |
+
{
|
| 581 |
+
"cell_type": "code",
|
| 582 |
+
"execution_count": null,
|
| 583 |
+
"metadata": {},
|
| 584 |
+
"outputs": [],
|
| 585 |
+
"source": []
|
| 586 |
+
},
|
| 587 |
+
{
|
| 588 |
+
"cell_type": "code",
|
| 589 |
+
"execution_count": null,
|
| 590 |
+
"metadata": {},
|
| 591 |
+
"outputs": [],
|
| 592 |
+
"source": []
|
| 593 |
+
},
|
| 594 |
+
{
|
| 595 |
+
"cell_type": "code",
|
| 596 |
+
"execution_count": null,
|
| 597 |
+
"metadata": {},
|
| 598 |
+
"outputs": [],
|
| 599 |
+
"source": [
|
| 600 |
+
"# Optimized RAG System with E5-Mistral Embeddings and Gemini Flash Generation\n",
|
| 601 |
+
"\n",
|
| 602 |
+
"import json\n",
|
| 603 |
+
"import logging\n",
|
| 604 |
+
"import re\n",
|
| 605 |
+
"import os\n",
|
| 606 |
+
"import pickle\n",
|
| 607 |
+
"from typing import List, Tuple, Optional\n",
|
| 608 |
+
"import gradio as gr\n",
|
| 609 |
+
"from openai import OpenAI \n",
|
| 610 |
+
"from google import genai \n",
|
| 611 |
+
"from functools import lru_cache\n",
|
| 612 |
+
"from tenacity import retry, stop_after_attempt, wait_exponential\n",
|
| 613 |
+
"from langchain_community.retrievers import BM25Retriever\n",
|
| 614 |
+
"from langchain_community.vectorstores import FAISS\n",
|
| 615 |
+
"from langchain_core.embeddings import Embeddings\n",
|
| 616 |
+
"from langchain_core.documents import Document\n",
|
| 617 |
+
"from collections import defaultdict\n",
|
| 618 |
+
"import hashlib\n",
|
| 619 |
+
"from tqdm import tqdm \n",
|
| 620 |
+
"\n",
|
| 621 |
+
"from dotenv import load_dotenv\n",
|
| 622 |
+
"load_dotenv()\n",
|
| 623 |
+
"# --- Configuration ---\n",
|
| 624 |
+
"FAISS_INDEX_PATH = \"faiss_index\"\n",
|
| 625 |
+
"BM25_INDEX_PATH = \"bm25_index.pkl\"\n",
|
| 626 |
+
"CACHE_VERSION = \"v1\" # Increment when data format changes\n",
|
| 627 |
+
"embedding_model = \"e5-mistral-7b-instruct\"  # E5-Mistral embedding model, served through an OpenAI-compatible endpoint\n",
|
| 628 |
+
"generation_model = \"gemini-2.0-flash\" # Gemini generation model\n",
|
| 629 |
+
"data_file_name = \"AskNatureNet_data_enhanced.json\"\n",
|
| 630 |
+
"API_CONFIG = {\n",
|
| 631 |
+
" \"gemini_api_key\": os.getenv(\"GEMINI_API_KEY\") # Gemini API key for generation\n",
|
| 632 |
+
"}\n",
|
| 633 |
+
"\n",
|
| 634 |
+
"CHUNK_SIZE = 800\n",
|
| 635 |
+
"OVERLAP = 200\n",
|
| 636 |
+
"EMBEDDING_BATCH_SIZE = 32 # Batch size for embedding API calls\n",
|
| 637 |
+
"\n",
|
| 638 |
+
"# Initialize clients\n",
|
| 639 |
+
"OPENAI_API_CONFIG = {\n",
|
| 640 |
+
" \"api_key\": os.getenv(\"OPENAI_API_KEY\"),\n",
|
| 641 |
+
" \"base_url\": \"https://chat-ai.academiccloud.de/v1\"\n",
|
| 642 |
+
"}\n",
|
| 643 |
+
"client = OpenAI(**OPENAI_API_CONFIG)\n",
|
| 644 |
+
"gemini_client = genai.Client(api_key=API_CONFIG[\"gemini_api_key\"]) # Gemini client for generation\n",
|
| 645 |
+
"logging.basicConfig(level=logging.INFO)\n",
|
| 646 |
+
"logger = logging.getLogger(__name__)\n",
|
| 647 |
+
"\n",
|
| 648 |
+
"# --- Helper Functions ---\n",
|
| 649 |
+
"def get_data_hash(file_path: str) -> str:\n",
|
| 650 |
+
" \"\"\"Generate hash of data file for cache validation\"\"\"\n",
|
| 651 |
+
" with open(file_path, \"rb\") as f:\n",
|
| 652 |
+
" return hashlib.md5(f.read()).hexdigest()\n",
|
| 653 |
+
"\n",
|
| 654 |
+
"# --- Custom Embedding Handler with Progress Tracking ---\n",
|
| 655 |
+
"class MistralEmbeddings(Embeddings):\n",
|
| 656 |
+
" \"\"\"E5-Mistral-7B embedding adapter with error handling and progress tracking\"\"\"\n",
|
| 657 |
+
" def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
|
| 658 |
+
" embeddings = []\n",
|
| 659 |
+
" try:\n",
|
| 660 |
+
" # Process in batches with progress tracking\n",
|
| 661 |
+
" for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc=\"Embedding Progress\"):\n",
|
| 662 |
+
" batch = texts[i:i + EMBEDDING_BATCH_SIZE]\n",
|
| 663 |
+
" response = client.embeddings.create(\n",
|
| 664 |
+
" input=batch,\n",
|
| 665 |
+
" model=embedding_model,\n",
|
| 666 |
+
" encoding_format=\"float\"\n",
|
| 667 |
+
" )\n",
|
| 668 |
+
" embeddings.extend([e.embedding for e in response.data])\n",
|
| 669 |
+
" return embeddings\n",
|
| 670 |
+
" except Exception as e:\n",
|
| 671 |
+
" logger.error(f\"Embedding Error: {str(e)}\")\n",
|
| 672 |
+
" return [[] for _ in texts]\n",
|
| 673 |
+
" \n",
|
| 674 |
+
" def embed_query(self, text: str) -> List[float]:\n",
|
| 675 |
+
" return self.embed_documents([text])[0]\n",
|
| 676 |
+
"\n",
|
| 677 |
+
"# --- Data Processing with Cache Validation ---\n",
|
| 678 |
+
"def load_and_chunk_data(file_path: str) -> List[Document]:\n",
|
| 679 |
+
"    \"\"\"Split each record's Strategy text into overlapping chunks with metadata; results are cached on disk\"\"\"\n",
|
| 680 |
+
" current_hash = get_data_hash(file_path)\n",
|
| 681 |
+
" cache_file = f\"documents_{CACHE_VERSION}_{current_hash}.pkl\"\n",
|
| 682 |
+
" \n",
|
| 683 |
+
" if os.path.exists(cache_file):\n",
|
| 684 |
+
" logger.info(\"Loading cached documents\")\n",
|
| 685 |
+
" with open(cache_file, \"rb\") as f:\n",
|
| 686 |
+
" return pickle.load(f)\n",
|
| 687 |
+
" \n",
|
| 688 |
+
" with open(file_path, 'r', encoding='utf-8') as f:\n",
|
| 689 |
+
" data = json.load(f)\n",
|
| 690 |
+
" \n",
|
| 691 |
+
" documents = []\n",
|
| 692 |
+
" for item in tqdm(data, desc=\"Chunking Progress\"):\n",
|
| 693 |
+
" base_content = f\"\"\"Source: {item['Source']}\n",
|
| 694 |
+
"Application: {item['Application']}\n",
|
| 695 |
+
"Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}\n",
|
| 696 |
+
"Technical Concepts: {', '.join(item['technical_concepts'])}\n",
|
| 697 |
+
"Biological Mechanisms: {', '.join(item['biological_mechanisms'])}\"\"\"\n",
|
| 698 |
+
" \n",
|
| 699 |
+
" strategy = item['Strategy']\n",
|
| 700 |
+
" for i in range(0, len(strategy), CHUNK_SIZE - OVERLAP):\n",
|
| 701 |
+
" chunk = strategy[i:i + CHUNK_SIZE]\n",
|
| 702 |
+
" documents.append(Document(\n",
|
| 703 |
+
" page_content=f\"{base_content}\\nStrategy Excerpt:\\n{chunk}\",\n",
|
| 704 |
+
" metadata={\n",
|
| 705 |
+
" \"source\": item[\"Source\"],\n",
|
| 706 |
+
" \"application\": item[\"Application\"],\n",
|
| 707 |
+
" \"technical_concepts\": item[\"technical_concepts\"],\n",
|
| 708 |
+
" \"sustainability_impacts\": item[\"sustainability_impacts\"],\n",
|
| 709 |
+
" \"hyperlink\": item[\"Hyperlink\"],\n",
|
| 710 |
+
" \"chunk_id\": f\"{item['Source']}-{len(documents)+1}\"\n",
|
| 711 |
+
" }\n",
|
| 712 |
+
" ))\n",
|
| 713 |
+
" \n",
|
| 714 |
+
" with open(cache_file, \"wb\") as f:\n",
|
| 715 |
+
" pickle.dump(documents, f)\n",
|
| 716 |
+
" return documents\n",
|
| 717 |
+
"\n",
|
| 718 |
+
"# --- Optimized Retrieval System ---\n",
|
| 719 |
+
"class EnhancedRetriever:\n",
|
| 720 |
+
"    \"\"\"Hybrid BM25 + FAISS retriever with on-disk index caching\"\"\"\n",
|
| 721 |
+
" def __init__(self, documents: List[Document]):\n",
|
| 722 |
+
" self.documents = documents\n",
|
| 723 |
+
" self.bm25 = self._init_bm25()\n",
|
| 724 |
+
" self.vector_store = self._init_faiss()\n",
|
| 725 |
+
" self.vector_retriever = self.vector_store.as_retriever(search_kwargs={\"k\": 3})\n",
|
| 726 |
+
"\n",
|
| 727 |
+
" def _init_bm25(self) -> BM25Retriever:\n",
|
| 728 |
+
" cache_key = f\"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
|
| 729 |
+
" if os.path.exists(cache_key):\n",
|
| 730 |
+
" logger.info(\"Loading cached BM25 index\")\n",
|
| 731 |
+
" with open(cache_key, \"rb\") as f:\n",
|
| 732 |
+
" return pickle.load(f)\n",
|
| 733 |
+
" \n",
|
| 734 |
+
" logger.info(\"Building new BM25 index\")\n",
|
| 735 |
+
" retriever = BM25Retriever.from_documents(self.documents)\n",
|
| 736 |
+
" retriever.k = 5\n",
|
| 737 |
+
" with open(cache_key, \"wb\") as f:\n",
|
| 738 |
+
" pickle.dump(retriever, f)\n",
|
| 739 |
+
" return retriever\n",
|
| 740 |
+
"\n",
|
| 741 |
+
" def _init_faiss(self) -> FAISS:\n",
|
| 742 |
+
" cache_key = f\"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
|
| 743 |
+
" if os.path.exists(cache_key):\n",
|
| 744 |
+
" logger.info(\"Loading cached FAISS index\")\n",
|
| 745 |
+
" return FAISS.load_local(\n",
|
| 746 |
+
" cache_key,\n",
|
| 747 |
+
" MistralEmbeddings(),\n",
|
| 748 |
+
" allow_dangerous_deserialization=True\n",
|
| 749 |
+
" )\n",
|
| 750 |
+
" \n",
|
| 751 |
+
" logger.info(\"Building new FAISS index\")\n",
|
| 752 |
+
" vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())\n",
|
| 753 |
+
" vector_store.save_local(cache_key)\n",
|
| 754 |
+
" return vector_store\n",
|
| 755 |
+
"\n",
|
| 756 |
+
" @lru_cache(maxsize=500)\n",
|
| 757 |
+
" def retrieve(self, query: str) -> str:\n",
|
| 758 |
+
" try:\n",
|
| 759 |
+
" processed_query = self._preprocess_query(query)\n",
|
| 760 |
+
" expanded_query = self._hyde_expansion(processed_query)\n",
|
| 761 |
+
" \n",
|
| 762 |
+
" bm25_results = self.bm25.invoke(processed_query)\n",
|
| 763 |
+
" vector_results = self.vector_retriever.invoke(processed_query)\n",
|
| 764 |
+
" expanded_results = self.bm25.invoke(expanded_query)\n",
|
| 765 |
+
" \n",
|
| 766 |
+
" fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])\n",
|
| 767 |
+
" return self._format_context(fused_results[:5])\n",
|
| 768 |
+
" except Exception as e:\n",
|
| 769 |
+
" logger.error(f\"Retrieval Error: {str(e)}\")\n",
|
| 770 |
+
" return \"\"\n",
|
| 771 |
+
"\n",
|
| 772 |
+
" def _preprocess_query(self, query: str) -> str:\n",
|
| 773 |
+
" return query.lower().strip()\n",
|
| 774 |
+
"\n",
|
| 775 |
+
" @lru_cache(maxsize=500)\n",
|
| 776 |
+
" def _hyde_expansion(self, query: str) -> str:\n",
|
| 777 |
+
" try:\n",
|
| 778 |
+
" response = gemini_client.models.generate_content( # Use Gemini client for HyDE\n",
|
| 779 |
+
" model=generation_model,\n",
|
| 780 |
+
" contents=f\"Generate a technical draft about biomimicry for: {query}\\nInclude domain-specific terms.\"\n",
|
| 781 |
+
" )\n",
|
| 782 |
+
" return response.text\n",
|
| 783 |
+
" except Exception as e:\n",
|
| 784 |
+
" logger.error(f\"HyDE Error: {str(e)}\")\n",
|
| 785 |
+
" return query\n",
|
| 786 |
+
"\n",
|
| 787 |
+
" def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:\n",
|
| 788 |
+
" fused_scores = defaultdict(float)\n",
|
| 789 |
+
" for docs in result_sets:\n",
|
| 790 |
+
" for rank, doc in enumerate(docs, 1):\n",
|
| 791 |
+
" fused_scores[doc.metadata[\"chunk_id\"]] += 1 / (rank + 60)\n",
|
| 792 |
+
" \n",
|
| 793 |
+
" seen = set()\n",
|
| 794 |
+
" return [\n",
|
| 795 |
+
" doc for doc in sorted(\n",
|
| 796 |
+
" (doc for docs in result_sets for doc in docs),\n",
|
| 797 |
+
" key=lambda x: fused_scores[x.metadata[\"chunk_id\"]],\n",
|
| 798 |
+
" reverse=True\n",
|
| 799 |
+
" ) if not (doc.metadata[\"chunk_id\"] in seen or seen.add(doc.metadata[\"chunk_id\"]))\n",
|
| 800 |
+
" ]\n",
|
| 801 |
+
"\n",
|
| 802 |
+
" def _format_context(self, docs: List[Document]) -> str:\n",
|
| 803 |
+
" context = []\n",
|
| 804 |
+
" for doc in docs:\n",
|
| 805 |
+
" context_str = f\"\"\"**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})\n",
|
| 806 |
+
" **Application**: {doc.metadata['application']}\n",
|
| 807 |
+
" **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}\n",
|
| 808 |
+
" **Strategy Excerpt**:\\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}\"\"\"\n",
|
| 809 |
+
" context.append(context_str)\n",
|
| 810 |
+
" return \"\\n\\n---\\n\\n\".join(context)\n",
|
| 811 |
+
"\n",
|
| 812 |
+
"# --- Generation System ---\n",
|
| 813 |
+
"SYSTEM_PROMPT = \"\"\"**Biomimicry Expert Guidelines**\n",
|
| 814 |
+
"1. Base answers strictly on context\n",
|
| 815 |
+
"2. **Bold** technical terms\n",
|
| 816 |
+
"3. Include reference links at the end of the response\n",
|
| 817 |
+
"\n",
|
| 818 |
+
"Context: {context}\"\"\"\n",
|
| 819 |
+
"\n",
|
| 820 |
+
"@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))\n",
|
| 821 |
+
"def get_ai_response(query: str, context: str) -> str:\n",
|
| 822 |
+
" try:\n",
|
| 823 |
+
" response = gemini_client.models.generate_content( # Use Gemini client for generation\n",
|
| 824 |
+
" model=generation_model,\n",
|
| 825 |
+
" contents=f\"{SYSTEM_PROMPT.format(context=context)}\\nQuestion: {query}\\nProvide a detailed technical answer:\"\n",
|
| 826 |
+
" )\n",
|
| 827 |
+
" logger.info(f\"Raw Response: {response.text}\") # Log raw response\n",
|
| 828 |
+
" return _postprocess_response(response.text)\n",
|
| 829 |
+
" except Exception as e:\n",
|
| 830 |
+
" logger.error(f\"Generation Error: {str(e)}\")\n",
|
| 831 |
+
" return \"I'm unable to generate a response right now. Please try again later.\"\n",
|
| 832 |
+
"\n",
|
| 833 |
+
"def _postprocess_response(response: str) -> str:\n",
|
| 834 |
+
" response = re.sub(r\"\\[(.*?)\\]\", r\"[\\1](#)\", response)\n",
|
| 835 |
+
" response = re.sub(r\"\\*\\*([\\w-]+)\\*\\*\", r\"**\\1**\", response)\n",
|
| 836 |
+
" return response\n",
|
| 837 |
+
"\n",
|
| 838 |
+
"# --- Optimized Pipeline ---\n",
|
| 839 |
+
"documents = load_and_chunk_data(data_file_name)\n",
|
| 840 |
+
"retriever = EnhancedRetriever(documents)\n",
|
| 841 |
+
"\n",
|
| 842 |
+
"def generate_response(question: str) -> str:\n",
|
| 843 |
+
" try:\n",
|
| 844 |
+
" context = retriever.retrieve(question)\n",
|
| 845 |
+
" return get_ai_response(question, context) if context else \"No relevant information found.\"\n",
|
| 846 |
+
" except Exception as e:\n",
|
| 847 |
+
" logger.error(f\"Pipeline Error: {str(e)}\")\n",
|
| 848 |
+
" return \"An error occurred processing your request.\"\n",
|
| 849 |
+
"\n",
|
| 850 |
+
"# --- Gradio Interface ---\n",
|
| 851 |
+
"def chat_interface(question: str, history: List[Tuple[str, str]]):\n",
|
| 852 |
+
" response = generate_response(question)\n",
|
| 853 |
+
" return \"\", history + [(question, response)]\n",
|
| 854 |
+
"\n",
|
| 855 |
+
"with gr.Blocks(title=\"AskNature BioRAG Expert\", theme=gr.themes.Soft()) as demo:\n",
|
| 856 |
+
" gr.Markdown(\"# 🌿 AskNature RAG-based Chatbot \")\n",
|
| 857 |
+
" with gr.Row():\n",
|
| 858 |
+
" chatbot = gr.Chatbot(label=\"Dialogue History\", height=500)\n",
|
| 859 |
+
" with gr.Row():\n",
|
| 860 |
+
" question = gr.Textbox(placeholder=\"Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')\",\n",
|
| 861 |
+
" label=\"Inquiry\", scale=4)\n",
|
| 862 |
+
" clear_btn = gr.Button(\"Clear History\", variant=\"secondary\")\n",
|
| 863 |
+
" \n",
|
| 864 |
+
" gr.Markdown(\"\"\"\n",
|
| 865 |
+
" <div style=\"text-align: center; color: #4a7c59;\">\n",
|
| 866 |
+
" <small>Powered by AskNature's Database | \n",
|
| 867 |
+
" Explore nature's blueprints at <a href=\"https://asknature.org\">asknature.org</a></small>\n",
|
| 868 |
+
" </div>\"\"\")\n",
|
| 869 |
+
" question.submit(chat_interface, [question, chatbot], [question, chatbot])\n",
|
| 870 |
+
" clear_btn.click(lambda: [], None, chatbot)\n",
|
| 871 |
+
"\n",
|
| 872 |
+
"if __name__ == \"__main__\":\n",
|
| 873 |
+
" demo.launch(show_error=True)"
|
| 874 |
+
]
|
| 875 |
+
},
|
| 876 |
+
{
|
| 877 |
+
"cell_type": "code",
|
| 878 |
+
"execution_count": null,
|
| 879 |
+
"metadata": {},
|
| 880 |
+
"outputs": [],
|
| 881 |
+
"source": []
|
| 882 |
+
},
|
| 883 |
+
{
|
| 884 |
+
"cell_type": "code",
|
| 885 |
+
"execution_count": null,
|
| 886 |
+
"        # Test hook: any query containing 'test' raises to simulate a rate-limit error\n",
|
| 887 |
+
"outputs": [],
|
| 888 |
+
"source": []
|
| 889 |
+
},
|
| 890 |
+
{
|
| 891 |
+
"cell_type": "code",
|
| 892 |
+
"execution_count": null,
|
| 893 |
+
"metadata": {},
|
| 894 |
+
"outputs": [],
|
| 895 |
+
"source": [
|
| 896 |
+
"# Optimized RAG System with E5-Mistral Embeddings and Gemini Flash Generation with Rate Control\n",
|
| 897 |
+
"import json\n",
|
| 898 |
+
"import logging\n",
|
| 899 |
+
"import re\n",
|
| 900 |
+
"import os\n",
|
| 901 |
+
"import pickle\n",
|
| 902 |
+
"from typing import List, Tuple, Optional\n",
|
| 903 |
+
"import gradio as gr\n",
|
| 904 |
+
"from openai import OpenAI # For embeddings\n",
|
| 905 |
+
"from google import genai # For generation\n",
|
| 906 |
+
"from functools import lru_cache\n",
|
| 907 |
+
"from tenacity import retry, stop_after_attempt, wait_exponential\n",
|
| 908 |
+
"from langchain_community.retrievers import BM25Retriever\n",
|
| 909 |
+
"from langchain_community.vectorstores import FAISS\n",
|
| 910 |
+
"from langchain_core.embeddings import Embeddings\n",
|
| 911 |
+
"from langchain_core.documents import Document\n",
|
| 912 |
+
"from collections import defaultdict\n",
|
| 913 |
+
"import hashlib\n",
|
| 914 |
+
"from tqdm import tqdm # For progress tracking\n",
|
| 915 |
+
"import time # For rate limit testing\n",
|
| 916 |
+
"from threading import Thread # For concurrent requests\n",
|
| 917 |
+
"\n",
|
| 918 |
+
"from dotenv import load_dotenv\n",
|
| 919 |
+
"load_dotenv()\n",
|
| 920 |
+
"\n",
|
| 921 |
+
"# --- Configuration ---\n",
|
| 922 |
+
"FAISS_INDEX_PATH = \"faiss_index\"\n",
|
| 923 |
+
"BM25_INDEX_PATH = \"bm25_index.pkl\"\n",
|
| 924 |
+
"CACHE_VERSION = \"v1\" # Increment when data format changes\n",
|
| 925 |
+
"embedding_model = \"e5-mistral-7b-instruct\"  # E5-Mistral embedding model, served through an OpenAI-compatible endpoint\n",
|
| 926 |
+
"generation_model = \"gemini-2.0-flash\" # Gemini generation model\n",
|
| 927 |
+
"data_file_name = \"AskNatureNet_data_enhanced.json\"\n",
|
| 928 |
+
"EMBEDDING_BATCH_SIZE = 32 # Batch size for embedding API calls\n",
|
| 929 |
+
"\n",
|
| 930 |
+
"# List of Gemini API keys\n",
|
| 931 |
+
"GEMINI_API_KEYS = [\n",
|
| 932 |
+
" os.getenv(\"GEMINI_API_KEY_1\"),\n",
|
| 933 |
+
" os.getenv(\"GEMINI_API_KEY_2\")\n",
|
| 934 |
+
"]\n",
|
| 935 |
+
"\n",
|
| 936 |
+
"current_key_index = 0\n",
|
| 937 |
+
"\n",
|
| 938 |
+
"def get_gemini_client():\n",
|
| 939 |
+
" global current_key_index\n",
|
| 940 |
+
" api_key = GEMINI_API_KEYS[current_key_index]\n",
|
| 941 |
+
" print(f\"Using Gemini API Key: {api_key}\")\n",
|
| 942 |
+
" return genai.Client(api_key=api_key)\n",
|
| 943 |
+
"\n",
|
| 944 |
+
"def switch_gemini_key():\n",
|
| 945 |
+
" global current_key_index\n",
|
| 946 |
+
" current_key_index = (current_key_index + 1) % len(GEMINI_API_KEYS)\n",
|
| 947 |
+
" print(f\"Switched to Gemini API Key: {GEMINI_API_KEYS[current_key_index]}\")\n",
|
| 948 |
+
" return get_gemini_client()\n",
|
| 949 |
+
"\n",
|
| 950 |
+
"# Initialize clients\n",
|
| 951 |
+
"OPENAI_API_CONFIG = {\n",
|
| 952 |
+
" \"api_key\": os.getenv(\"OPENAI_API_KEY\"),\n",
|
| 953 |
+
" \"base_url\": \"https://chat-ai.academiccloud.de/v1\"\n",
|
| 954 |
+
"}\n",
|
| 955 |
+
"client = OpenAI(**OPENAI_API_CONFIG)\n",
|
| 956 |
+
"gemini_client = get_gemini_client() # Initialize with the first key\n",
|
| 957 |
+
"logging.basicConfig(level=logging.INFO)\n",
|
| 958 |
+
"logger = logging.getLogger(__name__)\n",
|
| 959 |
+
"\n",
|
| 960 |
+
"# --- Helper Functions ---\n",
|
| 961 |
+
"def get_data_hash(file_path: str) -> str:\n",
|
| 962 |
+
" \"\"\"Generate hash of data file for cache validation\"\"\"\n",
|
| 963 |
+
" with open(file_path, \"rb\") as f:\n",
|
| 964 |
+
" return hashlib.md5(f.read()).hexdigest()\n",
|
| 965 |
+
"\n",
|
| 966 |
+
"# --- Custom Embedding Handler with Progress Tracking ---\n",
|
| 967 |
+
"class MistralEmbeddings(Embeddings):\n",
|
| 968 |
+
" \"\"\"E5-Mistral-7B embedding adapter with error handling and progress tracking\"\"\"\n",
|
| 969 |
+
" def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
|
| 970 |
+
" embeddings = []\n",
|
| 971 |
+
" try:\n",
|
| 972 |
+
" # Process in batches with progress tracking\n",
|
| 973 |
+
" for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc=\"Embedding Progress\"):\n",
|
| 974 |
+
" batch = texts[i:i + EMBEDDING_BATCH_SIZE]\n",
|
| 975 |
+
" response = client.embeddings.create(\n",
|
| 976 |
+
" input=batch,\n",
|
| 977 |
+
" model=embedding_model,\n",
|
| 978 |
+
" encoding_format=\"float\"\n",
|
| 979 |
+
" )\n",
|
| 980 |
+
" embeddings.extend([e.embedding for e in response.data])\n",
|
| 981 |
+
" return embeddings\n",
|
| 982 |
+
" except Exception as e:\n",
|
| 983 |
+
" logger.error(f\"Embedding Error: {str(e)}\")\n",
|
| 984 |
+
" return [[] for _ in texts]\n",
|
| 985 |
+
" \n",
|
| 986 |
+
" def embed_query(self, text: str) -> List[float]:\n",
|
| 987 |
+
" return self.embed_documents([text])[0]\n",
|
| 988 |
+
"\n",
|
| 989 |
+
"# --- Data Processing with Cache Validation ---\n",
|
| 990 |
+
"def load_and_chunk_data(file_path: str) -> List[Document]:\n",
|
| 991 |
+
"    \"\"\"Split each record's Strategy text into overlapping chunks with metadata; results are cached on disk\"\"\"\n",
|
| 992 |
+
" current_hash = get_data_hash(file_path)\n",
|
| 993 |
+
" cache_file = f\"documents_{CACHE_VERSION}_{current_hash}.pkl\"\n",
|
| 994 |
+
" \n",
|
| 995 |
+
" if os.path.exists(cache_file):\n",
|
| 996 |
+
" logger.info(\"Loading cached documents\")\n",
|
| 997 |
+
" with open(cache_file, \"rb\") as f:\n",
|
| 998 |
+
" return pickle.load(f)\n",
|
| 999 |
+
" \n",
|
| 1000 |
+
" with open(file_path, 'r', encoding='utf-8') as f:\n",
|
| 1001 |
+
" data = json.load(f)\n",
|
| 1002 |
+
" \n",
|
| 1003 |
+
" documents = []\n",
|
| 1004 |
+
" for item in tqdm(data, desc=\"Chunking Progress\"):\n",
|
| 1005 |
+
" base_content = f\"\"\"Source: {item['Source']}\n",
|
| 1006 |
+
"Application: {item['Application']}\n",
|
| 1007 |
+
"Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}\n",
|
| 1008 |
+
"Technical Concepts: {', '.join(item['technical_concepts'])}\n",
|
| 1009 |
+
"Biological Mechanisms: {', '.join(item['biological_mechanisms'])}\"\"\"\n",
|
| 1010 |
+
" \n",
|
| 1011 |
+
" strategy = item['Strategy']\n",
|
| 1012 |
+
" for i in range(0, len(strategy), CHUNK_SIZE - OVERLAP):\n",
|
| 1013 |
+
" chunk = strategy[i:i + CHUNK_SIZE]\n",
|
| 1014 |
+
" documents.append(Document(\n",
|
| 1015 |
+
" page_content=f\"{base_content}\\nStrategy Excerpt:\\n{chunk}\",\n",
|
| 1016 |
+
" metadata={\n",
|
| 1017 |
+
" \"source\": item[\"Source\"],\n",
|
| 1018 |
+
" \"application\": item[\"Application\"],\n",
|
| 1019 |
+
" \"technical_concepts\": item[\"technical_concepts\"],\n",
|
| 1020 |
+
" \"sustainability_impacts\": item[\"sustainability_impacts\"],\n",
|
| 1021 |
+
" \"hyperlink\": item[\"Hyperlink\"],\n",
|
| 1022 |
+
" \"chunk_id\": f\"{item['Source']}-{len(documents)+1}\"\n",
|
| 1023 |
+
" }\n",
|
| 1024 |
+
" ))\n",
|
| 1025 |
+
" \n",
|
| 1026 |
+
" with open(cache_file, \"wb\") as f:\n",
|
| 1027 |
+
" pickle.dump(documents, f)\n",
|
| 1028 |
+
" return documents\n",
|
| 1029 |
+
"\n",
|
| 1030 |
+
"# --- Optimized Retrieval System ---\n",
|
| 1031 |
+
"class EnhancedRetriever:\n",
|
| 1032 |
+
"    \"\"\"Hybrid BM25 + FAISS retriever with on-disk index caching\"\"\"\n",
|
| 1033 |
+
" def __init__(self, documents: List[Document]):\n",
|
| 1034 |
+
" self.documents = documents\n",
|
| 1035 |
+
" self.bm25 = self._init_bm25()\n",
|
| 1036 |
+
" self.vector_store = self._init_faiss()\n",
|
| 1037 |
+
" self.vector_retriever = self.vector_store.as_retriever(search_kwargs={\"k\": 3})\n",
|
| 1038 |
+
"\n",
|
| 1039 |
+
" def _init_bm25(self) -> BM25Retriever:\n",
|
| 1040 |
+
" cache_key = f\"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
|
| 1041 |
+
" if os.path.exists(cache_key):\n",
|
| 1042 |
+
" logger.info(\"Loading cached BM25 index\")\n",
|
| 1043 |
+
" with open(cache_key, \"rb\") as f:\n",
|
| 1044 |
+
" return pickle.load(f)\n",
|
| 1045 |
+
" \n",
|
| 1046 |
+
" logger.info(\"Building new BM25 index\")\n",
|
| 1047 |
+
" retriever = BM25Retriever.from_documents(self.documents)\n",
|
| 1048 |
+
" retriever.k = 5\n",
|
| 1049 |
+
" with open(cache_key, \"wb\") as f:\n",
|
| 1050 |
+
" pickle.dump(retriever, f)\n",
|
| 1051 |
+
" return retriever\n",
|
| 1052 |
+
"\n",
|
| 1053 |
+
" def _init_faiss(self) -> FAISS:\n",
|
| 1054 |
+
" cache_key = f\"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}\"\n",
|
| 1055 |
+
" if os.path.exists(cache_key):\n",
|
| 1056 |
+
" logger.info(\"Loading cached FAISS index\")\n",
|
| 1057 |
+
" return FAISS.load_local(\n",
|
| 1058 |
+
" cache_key,\n",
|
| 1059 |
+
" MistralEmbeddings(),\n",
|
| 1060 |
+
" allow_dangerous_deserialization=True\n",
|
| 1061 |
+
" )\n",
|
| 1062 |
+
" \n",
|
| 1063 |
+
" logger.info(\"Building new FAISS index\")\n",
|
| 1064 |
+
" vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())\n",
|
| 1065 |
+
" vector_store.save_local(cache_key)\n",
|
| 1066 |
+
" return vector_store\n",
|
| 1067 |
+
"\n",
|
| 1068 |
+
" @lru_cache(maxsize=500)\n",
|
| 1069 |
+
" def retrieve(self, query: str) -> str:\n",
|
| 1070 |
+
" try:\n",
|
| 1071 |
+
" processed_query = self._preprocess_query(query)\n",
|
| 1072 |
+
" expanded_query = self._hyde_expansion(processed_query)\n",
|
| 1073 |
+
" \n",
|
| 1074 |
+
" bm25_results = self.bm25.invoke(processed_query)\n",
|
| 1075 |
+
" vector_results = self.vector_retriever.invoke(processed_query)\n",
|
| 1076 |
+
" expanded_results = self.bm25.invoke(expanded_query)\n",
|
| 1077 |
+
" \n",
|
| 1078 |
+
" fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])\n",
|
| 1079 |
+
" return self._format_context(fused_results[:5])\n",
|
| 1080 |
+
" except Exception as e:\n",
|
| 1081 |
+
" logger.error(f\"Retrieval Error: {str(e)}\")\n",
|
| 1082 |
+
" return \"\"\n",
|
| 1083 |
+
"\n",
|
| 1084 |
+
" def _preprocess_query(self, query: str) -> str:\n",
|
| 1085 |
+
" return query.lower().strip()\n",
|
| 1086 |
+
"\n",
|
| 1087 |
+
" @lru_cache(maxsize=500)\n",
|
| 1088 |
+
" def _hyde_expansion(self, query: str) -> str:\n",
|
| 1089 |
+
" try:\n",
|
| 1090 |
+
" response = gemini_client.models.generate_content( # Use Gemini client for HyDE\n",
|
| 1091 |
+
" model=generation_model,\n",
|
| 1092 |
+
" contents=f\"Generate a technical draft about biomimicry for: {query}\\nInclude domain-specific terms.\"\n",
|
| 1093 |
+
" )\n",
|
| 1094 |
+
" return response.text\n",
|
| 1095 |
+
" except Exception as e:\n",
|
| 1096 |
+
" logger.error(f\"HyDE Error: {str(e)}\")\n",
|
| 1097 |
+
" return query\n",
|
| 1098 |
+
"\n",
|
| 1099 |
+
" def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:\n",
|
| 1100 |
+
" fused_scores = defaultdict(float)\n",
|
| 1101 |
+
" for docs in result_sets:\n",
|
| 1102 |
+
" for rank, doc in enumerate(docs, 1):\n",
|
| 1103 |
+
" fused_scores[doc.metadata[\"chunk_id\"]] += 1 / (rank + 60)\n",
|
| 1104 |
+
" \n",
|
| 1105 |
+
" seen = set()\n",
|
| 1106 |
+
" return [\n",
|
| 1107 |
+
" doc for doc in sorted(\n",
|
| 1108 |
+
" (doc for docs in result_sets for doc in docs),\n",
|
| 1109 |
+
" key=lambda x: fused_scores[x.metadata[\"chunk_id\"]],\n",
|
| 1110 |
+
" reverse=True\n",
|
| 1111 |
+
" ) if not (doc.metadata[\"chunk_id\"] in seen or seen.add(doc.metadata[\"chunk_id\"]))\n",
|
| 1112 |
+
" ]\n",
|
| 1113 |
+
"\n",
|
| 1114 |
+
" def _format_context(self, docs: List[Document]) -> str:\n",
|
| 1115 |
+
" context = []\n",
|
| 1116 |
+
" for doc in docs:\n",
|
| 1117 |
+
" context_str = f\"\"\"**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})\n",
|
| 1118 |
+
" **Application**: {doc.metadata['application']}\n",
|
| 1119 |
+
" **Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}\n",
|
| 1120 |
+
" **Strategy Excerpt**:\\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}\"\"\"\n",
|
| 1121 |
+
" context.append(context_str)\n",
|
| 1122 |
+
" return \"\\n\\n---\\n\\n\".join(context)\n",
|
| 1123 |
+
"\n",
|
| 1124 |
+
"# --- Generation System ---\n",
|
| 1125 |
+
"SYSTEM_PROMPT = \"\"\"**Biomimicry Expert Guidelines**\n",
|
| 1126 |
+
"1. Base answers strictly on context\n",
|
| 1127 |
+
"2. **Bold** technical terms\n",
|
| 1128 |
+
"3. Include reference links at the end of the response\n",
|
| 1129 |
+
"\n",
|
| 1130 |
+
"Context: {context}\"\"\"\n",
|
| 1131 |
+
"\n",
|
| 1132 |
+
"@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))\n",
|
| 1133 |
+
"def get_ai_response(query: str, context: str) -> str:\n",
|
| 1134 |
+
" global gemini_client\n",
|
| 1135 |
+
" try:\n",
|
| 1136 |
+
" # Simulate a rate limit error for testing\n",
|
| 1137 |
+
" if \"test\" in query.lower():\n",
|
| 1138 |
+
" raise Exception(\"Simulated rate limit error\")\n",
|
| 1139 |
+
" \n",
|
| 1140 |
+
" response = gemini_client.models.generate_content( # Use Gemini client for generation\n",
|
| 1141 |
+
" model=generation_model,\n",
|
| 1142 |
+
" contents=f\"{SYSTEM_PROMPT.format(context=context)}\\nQuestion: {query}\\nProvide a detailed technical answer:\"\n",
|
| 1143 |
+
" )\n",
|
| 1144 |
+
" logger.info(f\"Raw Response: {response.text}\") # Log raw response\n",
|
| 1145 |
+
" return _postprocess_response(response.text)\n",
|
| 1146 |
+
" except Exception as e:\n",
|
| 1147 |
+
" logger.error(f\"Generation Error: {str(e)}\")\n",
|
| 1148 |
+
" gemini_client = switch_gemini_key() # Switch to the next API key\n",
|
| 1149 |
+
" return \"I'm unable to generate a response right now. Please try again later.\"\n",
|
| 1150 |
+
"\n",
|
| 1151 |
+
"def _postprocess_response(response: str) -> str:\n",
|
| 1152 |
+
" response = re.sub(r\"\\[(.*?)\\]\", r\"[\\1](#)\", response)\n",
|
| 1153 |
+
" response = re.sub(r\"\\*\\*([\\w-]+)\\*\\*\", r\"**\\1**\", response)\n",
|
| 1154 |
+
" return response\n",
|
| 1155 |
+
"\n",
|
| 1156 |
+
"# --- Optimized Pipeline ---\n",
|
| 1157 |
+
"documents = load_and_chunk_data(data_file_name)\n",
|
| 1158 |
+
"retriever = EnhancedRetriever(documents)\n",
|
| 1159 |
+
"\n",
|
| 1160 |
+
"def generate_response(question: str) -> str:\n",
|
| 1161 |
+
" try:\n",
|
| 1162 |
+
" context = retriever.retrieve(question)\n",
|
| 1163 |
+
" return get_ai_response(question, context) if context else \"No relevant information found.\"\n",
|
| 1164 |
+
" except Exception as e:\n",
|
| 1165 |
+
" logger.error(f\"Pipeline Error: {str(e)}\")\n",
|
| 1166 |
+
" return \"An error occurred processing your request.\"\n",
|
| 1167 |
+
"\n",
|
| 1168 |
+
"# --- Gradio Interface ---\n",
|
| 1169 |
+
"def chat_interface(question: str, history: List[Tuple[str, str]]):\n",
|
| 1170 |
+
" response = generate_response(question)\n",
|
| 1171 |
+
" return \"\", history + [(question, response)]\n",
|
| 1172 |
+
"\n",
|
| 1173 |
+
"with gr.Blocks(title=\"AskNature BioRAG Expert\", theme=gr.themes.Soft()) as demo:\n",
|
| 1174 |
+
" gr.Markdown(\"# 🌿 AskNature RAG-based Chatbot \")\n",
|
| 1175 |
+
" with gr.Row():\n",
|
| 1176 |
+
" chatbot = gr.Chatbot(label=\"Dialogue History\", height=500)\n",
|
| 1177 |
+
" with gr.Row():\n",
|
| 1178 |
+
" question = gr.Textbox(placeholder=\"Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')\",\n",
|
| 1179 |
+
" label=\"Inquiry\", scale=4)\n",
|
| 1180 |
+
" clear_btn = gr.Button(\"Clear History\", variant=\"secondary\")\n",
|
| 1181 |
+
" \n",
|
| 1182 |
+
" gr.Markdown(\"\"\"\n",
|
| 1183 |
+
" <div style=\"text-align: center; color: #4a7c59;\">\n",
|
| 1184 |
+
" <small>Powered by AskNature's Database | \n",
|
| 1185 |
+
" Explore nature's blueprints at <a href=\"https://asknature.org\">asknature.org</a></small>\n",
|
| 1186 |
+
" </div>\"\"\")\n",
|
| 1187 |
+
" question.submit(chat_interface, [question, chatbot], [question, chatbot])\n",
|
| 1188 |
+
" clear_btn.click(lambda: [], None, chatbot)\n",
|
| 1189 |
+
"\n",
|
| 1190 |
+
"# --- Rate Limit Testing ---\n",
|
| 1191 |
+
"def test_rate_limit():\n",
|
| 1192 |
+
" \"\"\"Simulate high-volume requests to test rate limit handling\"\"\"\n",
|
| 1193 |
+
" test_questions = [\n",
|
| 1194 |
+
" \"How do coral proteins help make eco-friendly fabrics without dyes?\",\n",
|
| 1195 |
+
" \"What environmental problems do coral-inspired textiles solve?\",\n",
|
| 1196 |
+
" \"What is industrial symbiosis and how does the Kalundborg example work?\",\n",
|
| 1197 |
+
" \"How do Metavision sensors work like human eyes to save energy?\",\n",
|
| 1198 |
+
" \"How does TISSIUM copy skin proteins for medical adhesives?\",\n",
|
| 1199 |
+
" \"How does DNA-level design create better fibers inspired by nature?\",\n",
|
| 1200 |
+
" \"Why is industrial symbiosis hard to implement despite benefits?\",\n",
|
| 1201 |
+
" \"How can biological systems inspire sustainable manufacturing?\",\n",
|
| 1202 |
+
" \"What other industries can use protein-based materials like Werewool?\",\n",
|
| 1203 |
+
" \"How could event-based cameras improve security systems?\",\n",
|
| 1204 |
+
" \"Design a factory network that works like coral reef partnerships - what features would it need?\"\n",
|
| 1205 |
+
" ]\n",
|
| 1206 |
+
"\n",
|
| 1207 |
+
" for i, question in enumerate(test_questions):\n",
|
| 1208 |
+
" print(f\"\\nSending query {i+1}: {question}\")\n",
|
| 1209 |
+
" response = generate_response(question)\n",
|
| 1210 |
+
" print(f\"Response: {response}\")\n",
|
| 1211 |
+
" time.sleep(0.5) # Add a small delay between requests\n",
|
| 1212 |
+
"\n",
|
| 1213 |
+
    "# Launch the Gradio app in a background thread, then run the rate limit test in the main thread\n",
|
| 1214 |
+
"if __name__ == \"__main__\":\n",
|
| 1215 |
+
" gradio_thread = Thread(target=demo.launch, kwargs={\"show_error\": True})\n",
|
| 1216 |
+
" gradio_thread.start()\n",
|
| 1217 |
+
" time.sleep(5)\n",
|
| 1218 |
+
" test_rate_limit()"
|
| 1219 |
+
]
|
| 1220 |
+
},
|
| 1221 |
+
{
|
| 1222 |
+
"cell_type": "code",
|
| 1223 |
+
"execution_count": null,
|
| 1224 |
+
"metadata": {},
|
| 1225 |
+
"outputs": [],
|
| 1226 |
+
"source": []
|
| 1227 |
+
}
|
| 1228 |
+
],
|
| 1229 |
+
"metadata": {
|
| 1230 |
+
"kernelspec": {
|
| 1231 |
+
"display_name": "rag",
|
| 1232 |
+
"language": "python",
|
| 1233 |
+
"name": "python3"
|
| 1234 |
+
},
|
| 1235 |
+
"language_info": {
|
| 1236 |
+
"codemirror_mode": {
|
| 1237 |
+
"name": "ipython",
|
| 1238 |
+
"version": 3
|
| 1239 |
+
},
|
| 1240 |
+
"file_extension": ".py",
|
| 1241 |
+
"mimetype": "text/x-python",
|
| 1242 |
+
"name": "python",
|
| 1243 |
+
"nbconvert_exporter": "python",
|
| 1244 |
+
"pygments_lexer": "ipython3",
|
| 1245 |
+
"version": "3.12.8"
|
| 1246 |
+
}
|
| 1247 |
+
},
|
| 1248 |
+
"nbformat": 4,
|
| 1249 |
+
"nbformat_minor": 2
|
| 1250 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
langchain
|
| 3 |
+
openai
|
| 4 |
+
tenacity
|
| 5 |
+
langchain-core
|
| 6 |
+
langchain-community
|
| 7 |
+
langchain-llm
|
| 8 |
+
protobuf
|
| 9 |
+
numpy
|
| 10 |
+
scipy
|
| 11 |
+
faiss-cpu
|
| 12 |
+
transformers
|
| 13 |
+
sentencepiece
|
| 14 |
+
regex
|
| 15 |
+
json5
|
| 16 |
+
rank_bm25
|
| 17 |
+
huggingface_hub
|
| 18 |
+
tqdm
|
| 19 |
+
sentence-transformers
|
| 20 |
+
google
|
| 21 |
+
google-cloud
|