Build error
Update app.py
app.py
CHANGED
@@ -1,276 +1,3 @@
-# Optimized RAG System with E5-Mistral Embeddings and Gemini 2.0 Flash Generation
-import json
-import logging
-import re
-import os
-import pickle
-from typing import List, Tuple, Optional
-import gradio as gr
-from openai import OpenAI
-from google import genai
-from functools import lru_cache
-from tenacity import retry, stop_after_attempt, wait_exponential
-from langchain_community.retrievers import BM25Retriever
-from langchain_community.vectorstores import FAISS
-from langchain_core.embeddings import Embeddings
-from langchain_core.documents import Document
-from collections import defaultdict
-import hashlib
-from tqdm import tqdm
-
-from dotenv import load_dotenv
-load_dotenv()
-# --- Configuration ---
-FAISS_INDEX_PATH = "faiss_index"
-BM25_INDEX_PATH = "bm25_index.pkl"
-CACHE_VERSION = "v1"  # Increment when data format changes
-embedding_model = "e5-mistral-7b-instruct"  # OpenAI embedding model
-generation_model = "gemini-2.0-flash"  # Gemini generation model
-data_file_name = "AskNatureNet_data_enhanced.json"
-API_CONFIG = {
-    "gemini_api_key": os.getenv("GEMINI_API_KEY")  # Gemini API key for generation
-}
-
-CHUNK_SIZE = 800
-OVERLAP = 200
-EMBEDDING_BATCH_SIZE = 32  # Batch size for embedding API calls
-
-# Initialize clients
-OPENAI_API_CONFIG = {
-    "api_key": os.getenv("OPENAI_API_KEY"),
-    "base_url": "https://chat-ai.academiccloud.de/v1"
-}
-client = OpenAI(**OPENAI_API_CONFIG)
-gemini_client = genai.Client(api_key=API_CONFIG["gemini_api_key"])  # Gemini client for generation
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# --- Helper Functions ---
-def get_data_hash(file_path: str) -> str:
-    """Generate hash of data file for cache validation"""
-    with open(file_path, "rb") as f:
-        return hashlib.md5(f.read()).hexdigest()
-
-# --- Custom Embedding Handler with Progress Tracking ---
-class MistralEmbeddings(Embeddings):
-    """E5-Mistral-7B embedding adapter with error handling and progress tracking"""
-    def embed_documents(self, texts: List[str]) -> List[List[float]]:
-        embeddings = []
-        try:
-            # Process in batches with progress tracking
-            for i in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="Embedding Progress"):
-                batch = texts[i:i + EMBEDDING_BATCH_SIZE]
-                response = client.embeddings.create(
-                    input=batch,
-                    model=embedding_model,
-                    encoding_format="float"
-                )
-                embeddings.extend([e.embedding for e in response.data])
-            return embeddings
-        except Exception as e:
-            logger.error(f"Embedding Error: {str(e)}")
-            return [[] for _ in texts]
-
-    def embed_query(self, text: str) -> List[float]:
-        return self.embed_documents([text])[0]
-
-# --- Data Processing with Cache Validation ---
-def load_and_chunk_data(file_path: str) -> List[Document]:
-    """Enhanced chunking with metadata preservation"""
-    current_hash = get_data_hash(file_path)
-    cache_file = f"documents_{CACHE_VERSION}_{current_hash}.pkl"
-
-    if os.path.exists(cache_file):
-        logger.info("Loading cached documents")
-        with open(cache_file, "rb") as f:
-            return pickle.load(f)
-
-    with open(file_path, 'r', encoding='utf-8') as f:
-        data = json.load(f)
-
-    documents = []
-    for item in tqdm(data, desc="Chunking Progress"):
-        base_content = f"""Source: {item['Source']}
-Application: {item['Application']}
-Functions: {', '.join(filter(None, [item.get('Function1'), item.get('Function2')]))}
-Technical Concepts: {', '.join(item['technical_concepts'])}
-Biological Mechanisms: {', '.join(item['biological_mechanisms'])}"""
-
-        strategy = item['Strategy']
-        for i in range(0, len(strategy), CHUNK_SIZE - OVERLAP):
-            chunk = strategy[i:i + CHUNK_SIZE]
-            documents.append(Document(
-                page_content=f"{base_content}\nStrategy Excerpt:\n{chunk}",
-                metadata={
-                    "source": item["Source"],
-                    "application": item["Application"],
-                    "technical_concepts": item["technical_concepts"],
-                    "sustainability_impacts": item["sustainability_impacts"],
-                    "hyperlink": item["Hyperlink"],
-                    "chunk_id": f"{item['Source']}-{len(documents)+1}"
-                }
-            ))
-
-    with open(cache_file, "wb") as f:
-        pickle.dump(documents, f)
-    return documents
-
-# --- Optimized Retrieval System ---
-class EnhancedRetriever:
-    """Hybrid retriever with persistent caching"""
-    def __init__(self, documents: List[Document]):
-        self.documents = documents
-        self.bm25 = self._init_bm25()
-        self.vector_store = self._init_faiss()
-        self.vector_retriever = self.vector_store.as_retriever(search_kwargs={"k": 3})
-
-    def _init_bm25(self) -> BM25Retriever:
-        cache_key = f"{BM25_INDEX_PATH}_{get_data_hash(data_file_name)}"
-        if os.path.exists(cache_key):
-            logger.info("Loading cached BM25 index")
-            with open(cache_key, "rb") as f:
-                return pickle.load(f)
-
-        logger.info("Building new BM25 index")
-        retriever = BM25Retriever.from_documents(self.documents)
-        retriever.k = 5
-        with open(cache_key, "wb") as f:
-            pickle.dump(retriever, f)
-        return retriever
-
-    def _init_faiss(self) -> FAISS:
-        cache_key = f"{FAISS_INDEX_PATH}_{get_data_hash(data_file_name)}"
-        if os.path.exists(cache_key):
-            logger.info("Loading cached FAISS index")
-            return FAISS.load_local(
-                cache_key,
-                MistralEmbeddings(),
-                allow_dangerous_deserialization=True
-            )
-
-        logger.info("Building new FAISS index")
-        vector_store = FAISS.from_documents(self.documents, MistralEmbeddings())
-        vector_store.save_local(cache_key)
-        return vector_store
-
-    @lru_cache(maxsize=500)
-    def retrieve(self, query: str) -> str:
-        try:
-            processed_query = self._preprocess_query(query)
-            expanded_query = self._hyde_expansion(processed_query)
-
-            bm25_results = self.bm25.invoke(processed_query)
-            vector_results = self.vector_retriever.invoke(processed_query)
-            expanded_results = self.bm25.invoke(expanded_query)
-
-            fused_results = self._fuse_results([bm25_results, vector_results, expanded_results])
-            return self._format_context(fused_results[:5])
-        except Exception as e:
-            logger.error(f"Retrieval Error: {str(e)}")
-            return ""
-
-    def _preprocess_query(self, query: str) -> str:
-        return query.lower().strip()
-
-    @lru_cache(maxsize=500)
-    def _hyde_expansion(self, query: str) -> str:
-        try:
-            response = gemini_client.models.generate_content(  # Use Gemini client for HyDE
-                model=generation_model,
-                contents=f"Generate a technical draft about biomimicry for: {query}\nInclude domain-specific terms."
-            )
-            return response.text
-        except Exception as e:
-            logger.error(f"HyDE Error: {str(e)}")
-            return query
-
-    def _fuse_results(self, result_sets: List[List[Document]]) -> List[Document]:
-        fused_scores = defaultdict(float)
-        for docs in result_sets:
-            for rank, doc in enumerate(docs, 1):
-                fused_scores[doc.metadata["chunk_id"]] += 1 / (rank + 60)
-
-        seen = set()
-        return [
-            doc for doc in sorted(
-                (doc for docs in result_sets for doc in docs),
-                key=lambda x: fused_scores[x.metadata["chunk_id"]],
-                reverse=True
-            ) if not (doc.metadata["chunk_id"] in seen or seen.add(doc.metadata["chunk_id"]))
-        ]
-
-    def _format_context(self, docs: List[Document]) -> str:
-        context = []
-        for doc in docs:
-            context_str = f"""**Source**: [{doc.metadata['source']}]({doc.metadata['hyperlink']})
-**Application**: {doc.metadata['application']}
-**Key Concepts**: {', '.join(doc.metadata['technical_concepts'])}
-**Strategy Excerpt**:\n{doc.page_content.split('Strategy Excerpt:')[-1].strip()}"""
-            context.append(context_str)
-        return "\n\n---\n\n".join(context)
-
-# --- Generation System ---
-SYSTEM_PROMPT = """**Biomimicry Expert Guidelines**
-1. Base answers strictly on context
-2. **Bold** technical terms
-3. Include reference links at the end of the response
-
-Context: {context}"""
-
-@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=20))
-def get_ai_response(query: str, context: str) -> str:
-    try:
-        response = gemini_client.models.generate_content(  # Use Gemini client for generation
-            model=generation_model,
-            contents=f"{SYSTEM_PROMPT.format(context=context)}\nQuestion: {query}\nProvide a detailed technical answer:"
-        )
-        logger.info(f"Raw Response: {response.text}")  # Log raw response
-        return _postprocess_response(response.text)
-    except Exception as e:
-        logger.error(f"Generation Error: {str(e)}")
-        return "I'm unable to generate a response right now. Please try again later."
-
-def _postprocess_response(response: str) -> str:
-    response = re.sub(r"\[(.*?)\]", r"[\1](#)", response)
-    response = re.sub(r"\*\*([\w-]+)\*\*", r"**\1**", response)
-    return response
-
-# --- Optimized Pipeline ---
-documents = load_and_chunk_data(data_file_name)
-retriever = EnhancedRetriever(documents)
-
-def generate_response(question: str) -> str:
-    try:
-        context = retriever.retrieve(question)
-        return get_ai_response(question, context) if context else "No relevant information found."
-    except Exception as e:
-        logger.error(f"Pipeline Error: {str(e)}")
-        return "An error occurred processing your request."
-
-# --- Gradio Interface ---
-def chat_interface(question: str, history: List[Tuple[str, str]]):
-    response = generate_response(question)
-    return "", history + [(question, response)]
-
-with gr.Blocks(title="AskNature BioRAG Expert", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🌿 AskNature RAG-based Chatbot ")
-    with gr.Row():
-        chatbot = gr.Chatbot(label="Dialogue History", height=500)
-    with gr.Row():
-        question = gr.Textbox(placeholder="Ask about biomimicry (e.g. 'How does Werewool use coral proteins to make fibers?')",
-                              label="Inquiry", scale=4)
-        clear_btn = gr.Button("Clear History", variant="secondary")
-
-    gr.Markdown("""
-<div style="text-align: center; color: #4a7c59;">
-<small>Powered by AskNature's Database |
-Explore nature's blueprints at <a href="https://asknature.org">asknature.org</a></small>
-</div>""")
-    question.submit(chat_interface, [question, chatbot], [question, chatbot])
-    clear_btn.click(lambda: [], None, chatbot)
-
-if __name__ == "__main__":
-=======
 # Optimized RAG System with E5-Mistral Embeddings and Gemini Flash Generation
 
 import json
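The removed _fuse_results method scores chunks by reciprocal rank fusion: every result list (BM25, dense vector, HyDE-expanded BM25) contributes 1/(rank + 60) per document, and duplicates are then dropped by chunk_id. A minimal standalone sketch of that scoring scheme, assuming plain string ids rather than LangChain Document objects (the rrf_fuse name and ranked_lists input are illustrative, not from the commit):

from collections import defaultdict
from typing import Dict, List

def rrf_fuse(ranked_lists: List[List[str]], k: int = 60) -> List[str]:
    """Reciprocal rank fusion: items ranked highly in any list float to the top."""
    scores: Dict[str, float] = defaultdict(float)
    for ranking in ranked_lists:
        for rank, item_id in enumerate(ranking, start=1):
            scores[item_id] += 1.0 / (rank + k)  # same 1/(rank + 60) weighting as above
    # Sort ids by fused score, highest first; the dict keeps one entry per id
    return sorted(scores, key=scores.get, reverse=True)

# Example: three retrievers returning overlapping chunk ids
print(rrf_fuse([["a", "b", "c"], ["b", "a"], ["c", "b"]]))  # ['b', 'a', 'c']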