kiyer committed on
Commit
99c0738
·
verified ·
1 Parent(s): 0bf4d73

trying some optimizations

Browse files
Files changed (1) hide show
  1. app_gradio.py +342 -4
app_gradio.py CHANGED
@@ -45,10 +45,231 @@ from string import punctuation
45
  import pytextrank
46
  from prompts import *
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  openai_key = os.environ['openai_key']
49
  cohere_key = os.environ['cohere_key']
50
  os.environ["OPENAI_API_KEY"] = os.environ['openai_key']
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def load_nlp():
53
  nlp = spacy.load("en_core_web_sm")
54
  nlp.add_pipe("textrank")
@@ -89,9 +310,11 @@ def load_arxiv_corpus():
89
  # arxiv_corpus.load_faiss_index('embed', 'data/astrophindex.faiss')
90
 
91
  # keeping it up to date with the dataset
92
- arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
93
- arxiv_corpus.add_faiss_index(column='embed')
94
- print('loading arxiv corpus from disk')
 
 
95
  return arxiv_corpus
96
 
97
  class RetrievalSystem():
@@ -649,6 +872,121 @@ def run_pathfinder(query, top_k, extra_keywords, toggles, prompt_type, rag_type,
649
 
650
  yield formatted_df, rag_answer['answer'], consensus, qn_type, fig
651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  def create_interface():
653
  custom_css = """
654
  #custom-slider-* {
@@ -687,7 +1025,7 @@ def create_interface():
687
 
688
  inputs = [query, top_k, keywords, toggles, prompt_type, rag_type]
689
  outputs = [ret_papers, search_results_state, qntype, conc, plot]
690
- btn.click(fn=run_pathfinder, inputs=inputs, outputs=outputs)
691
 
692
  return demo
693
 
 
45
  import pytextrank
46
  from prompts import *
47
 
48
+ import os
49
+ from datasets import load_dataset
50
+ import pickle
51
+ import faiss
52
+ import numpy as np
53
+ from functools import lru_cache
54
+ import asyncio
55
+ import aiohttp
56
+ from concurrent.futures import ThreadPoolExecutor
57
+ import time
58
+
59
+ # Add to your main function
60
+ import gc
61
+
62
+ def cleanup_memory():
63
+ """Force garbage collection and clear caches"""
64
+ gc.collect()
65
+ chromadb.api.client.SharedSystemClient.clear_system_cache()
66
+
67
  openai_key = os.environ['openai_key']
68
  cohere_key = os.environ['cohere_key']
69
  os.environ["OPENAI_API_KEY"] = os.environ['openai_key']
70
 
71
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid tokenizer warnings
72
+ os.environ["HF_DATASETS_CACHE"] = "./cache" # Control cache location
73
+
74
+ # Use Hugging Face's built-in caching
75
+ from datasets import enable_caching
76
+ enable_caching()
77
+
78
+ class OptimizedDatasetLoader:
79
+ def __init__(self, cache_dir="./cache"):
80
+ self.cache_dir = cache_dir
81
+ os.makedirs(cache_dir, exist_ok=True)
82
+
83
+ @lru_cache(maxsize=1)
84
+ def load_arxiv_corpus_cached(self):
85
+ """Load dataset with aggressive caching"""
86
+ cache_path = os.path.join(self.cache_dir, "arxiv_corpus.pkl")
87
+ index_path = os.path.join(self.cache_dir, "faiss_index.bin")
88
+
89
+ # Try to load from cache first
90
+ if os.path.exists(cache_path) and os.path.exists(index_path):
91
+ print("Loading from cache...")
92
+ with open(cache_path, 'rb') as f:
93
+ arxiv_corpus = pickle.load(f)
94
+
95
+ # Load pre-built FAISS index
96
+ index = faiss.read_index(index_path)
97
+ arxiv_corpus._indexes = {'embed': index}
98
+ return arxiv_corpus
99
+
100
+ # If not cached, load and cache
101
+ print("Loading dataset and building cache...")
102
+ arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
103
+ arxiv_corpus.add_faiss_index(column='embed')
104
+
105
+ # Cache the dataset
106
+ with open(cache_path, 'wb') as f:
107
+ pickle.dump(arxiv_corpus, f)
108
+
109
+ # Cache the FAISS index
110
+ faiss.write_index(arxiv_corpus._indexes['embed'], index_path)
111
+
112
+ return arxiv_corpus
113
+
114
+ class AsyncRetrievalSystem:
115
+ def __init__(self):
116
+ self.dataset = arxiv_corpus
117
+ self.openai_key = os.environ['openai_key']
118
+ self.executor = ThreadPoolExecutor(max_workers=4)
119
+
120
+ async def async_embedding_call(self, texts, session):
121
+ """Async embedding API call"""
122
+ headers = {
123
+ "Authorization": f"Bearer {self.openai_key}",
124
+ "Content-Type": "application/json"
125
+ }
126
+
127
+ data = {
128
+ "input": texts if isinstance(texts, list) else [texts],
129
+ "model": "text-embedding-3-small"
130
+ }
131
+
132
+ async with session.post(
133
+ "https://api.openai.com/v1/embeddings",
134
+ headers=headers,
135
+ json=data
136
+ ) as response:
137
+ result = await response.json()
138
+ return [item['embedding'] for item in result['data']]
139
+
140
+ async def async_llm_call(self, messages, session, temperature=0):
141
+ """Async LLM API call"""
142
+ headers = {
143
+ "Authorization": f"Bearer {self.openai_key}",
144
+ "Content-Type": "application/json"
145
+ }
146
+
147
+ data = {
148
+ "model": "gpt-4o-mini",
149
+ "messages": messages,
150
+ "temperature": temperature
151
+ }
152
+
153
+ async with session.post(
154
+ "https://api.openai.com/v1/chat/completions",
155
+ headers=headers,
156
+ json=data
157
+ ) as response:
158
+ result = await response.json()
159
+ return result['choices'][0]['message']['content']
160
+
161
+ async def parallel_retrieve_and_analyze(self, query, top_k=10):
162
+ """Run multiple operations in parallel"""
163
+ async with aiohttp.ClientSession() as session:
164
+ # Start all async operations
165
+ tasks = []
166
+
167
+ # 1. Get query embedding
168
+ embedding_task = self.async_embedding_call(query, session)
169
+ tasks.append(embedding_task)
170
+
171
+ # 2. Generate HyDE document (if enabled)
172
+ hyde_messages = [
173
+ ("system", "You are an expert astronomer. Generate an abstract..."),
174
+ ("human", query)
175
+ ]
176
+ hyde_task = self.async_llm_call(hyde_messages, session, temperature=0.5)
177
+ tasks.append(hyde_task)
178
+
179
+ # 3. Question type classification
180
+ qtype_messages = [
181
+ ("system", "Classify this question type..."),
182
+ ("human", query)
183
+ ]
184
+ qtype_task = self.async_llm_call(qtype_messages, session)
185
+ tasks.append(qtype_task)
186
+
187
+ # Wait for all to complete
188
+ query_embedding, hyde_doc, question_type = await asyncio.gather(*tasks)
189
+
190
+ return {
191
+ 'embedding': query_embedding[0],
192
+ 'hyde_doc': hyde_doc,
193
+ 'question_type': question_type
194
+ }
195
+
196
+ def run_parallel_search(self, query, top_k=10):
197
+ """Wrapper to run async function"""
198
+ return asyncio.run(self.parallel_retrieve_and_analyze(query, top_k))
199
+
200
+ class OptimizedEmbedding:
201
+ def __init__(self, openai_key, batch_size=100):
202
+ self.client = OpenAI(api_key=openai_key)
203
+ self.batch_size = batch_size
204
+ self.embed_model = "text-embedding-3-small"
205
+
206
+ def batch_embeddings(self, texts):
207
+ """Process embeddings in batches for efficiency"""
208
+ all_embeddings = []
209
+
210
+ for i in range(0, len(texts), self.batch_size):
211
+ batch = texts[i:i + self.batch_size]
212
+ try:
213
+ response = self.client.embeddings.create(
214
+ input=batch,
215
+ model=self.embed_model
216
+ )
217
+ batch_embeddings = [item.embedding for item in response.data]
218
+ all_embeddings.extend(batch_embeddings)
219
+ except Exception as e:
220
+ print(f"Batch embedding failed: {e}")
221
+ # Fallback to individual processing
222
+ for text in batch:
223
+ emb = self.client.embeddings.create(
224
+ input=[text],
225
+ model=self.embed_model
226
+ ).data[0].embedding
227
+ all_embeddings.append(emb)
228
+
229
+ return all_embeddings
230
+
231
+ class MemoryOptimizedRAG:
232
+ def __init__(self):
233
+ self.vectorstore_cache = {}
234
+
235
+ def create_vectorstore_cached(self, documents, collection_name):
236
+ """Cache vectorstore to avoid recreation"""
237
+ cache_key = f"{collection_name}_{len(documents)}"
238
+
239
+ if cache_key in self.vectorstore_cache:
240
+ return self.vectorstore_cache[cache_key]
241
+
242
+ # Clear ChromaDB cache before creating new vectorstore
243
+ chromadb.api.client.SharedSystemClient.clear_system_cache()
244
+
245
+ text_splitter = RecursiveCharacterTextSplitter(
246
+ chunk_size=150,
247
+ chunk_overlap=50,
248
+ add_start_index=True
249
+ )
250
+ splits = text_splitter.split_documents(documents)
251
+
252
+ vectorstore = Chroma.from_documents(
253
+ documents=splits,
254
+ embedding=embeddings,
255
+ collection_name=collection_name
256
+ )
257
+
258
+ self.vectorstore_cache[cache_key] = vectorstore
259
+ return vectorstore
260
+
261
+ def cleanup_old_vectorstores(self, max_cache_size=3):
262
+ """Clean up old vectorstores to free memory"""
263
+ if len(self.vectorstore_cache) > max_cache_size:
264
+ # Remove oldest entries
265
+ oldest_keys = list(self.vectorstore_cache.keys())[:-max_cache_size]
266
+ for key in oldest_keys:
267
+ try:
268
+ self.vectorstore_cache[key].delete_collection()
269
+ except:
270
+ pass
271
+ del self.vectorstore_cache[key]
272
+
273
  def load_nlp():
274
  nlp = spacy.load("en_core_web_sm")
275
  nlp.add_pipe("textrank")
 
310
  # arxiv_corpus.load_faiss_index('embed', 'data/astrophindex.faiss')
311
 
312
  # keeping it up to date with the dataset
313
+ # arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
314
+ # arxiv_corpus.add_faiss_index(column='embed')
315
+ # print('loading arxiv corpus from disk')
316
+ loader = OptimizedDatasetLoader()
317
+ arxiv_corpus = loader.load_arxiv_corpus_cached()
318
  return arxiv_corpus
319
 
320
  class RetrievalSystem():
 
872
 
873
  yield formatted_df, rag_answer['answer'], consensus, qn_type, fig
874
 
875
+
876
+
877
+ async def run_pathfinder_optimized(query, top_k, extra_keywords, toggles,
878
+ prompt_type, rag_type, ec=None, progress=None):
879
+ """Optimized version of run_pathfinder with parallel processing"""
880
+
881
+ # Early validation
882
+ if check_mod(query):
883
+ yield None, "Query flagged by moderation", None, None, None
884
+ return
885
+
886
+ # Setup
887
+ input_keywords = [kw.strip() for kw in extra_keywords.split(',')] if extra_keywords else []
888
+ query_keywords = get_keywords(query)
889
+ ec.query_input_keywords = input_keywords + query_keywords
890
+ ec.toggles = toggles
891
+
892
+ # Configure retrieval method
893
+ ec.hyde = rag_type in ["Semantic + HyDE", "Semantic + HyDE + CoHERE"]
894
+ ec.rerank = rag_type in ["Semantic + CoHERE", "Semantic + HyDE + CoHERE"]
895
+
896
+ try:
897
+ if prompt_type == "Deep Research (BETA)":
898
+ # Deep research is inherently sequential, keep original implementation
899
+ formatted_df, rag_answer = deep_research(query, top_k=top_k, ec=ec)
900
+ yield formatted_df, rag_answer['answer'], None, None, None
901
+ else:
902
+ # Phase 1: Parallel initial operations
903
+ gr.Info("Starting parallel search operations...")
904
+
905
+ async with aiohttp.ClientSession() as session:
906
+ # Start retrieval
907
+ retrieval_task = asyncio.create_task(
908
+ async_retrieve(ec, query, top_k, session)
909
+ )
910
+
911
+ # Start question type analysis (independent operation)
912
+ qtype_task = asyncio.create_task(
913
+ async_question_type_analysis(query, session)
914
+ )
915
+
916
+ # Wait for retrieval to complete first
917
+ rs, small_df = await retrieval_task
918
+ formatted_df = ec.return_formatted_df(rs, small_df)
919
+ yield formatted_df, None, None, None, None
920
+
921
+ # Phase 2: RAG QA while question type analysis continues
922
+ gr.Info("Generating answer...")
923
+ rag_answer = await async_rag_qa(query, formatted_df, prompt_type, session)
924
+ yield formatted_df, rag_answer['answer'], None, None, None
925
+
926
+ # Phase 3: Parallel consensus and remaining operations
927
+ gr.Info("Finalizing analysis...")
928
+
929
+ consensus_task = asyncio.create_task(
930
+ async_consensus_evaluation(query, formatted_df, session)
931
+ )
932
+
933
+ plot_task = asyncio.create_task(
934
+ async_make_plot(formatted_df, top_k)
935
+ )
936
+
937
+ # Wait for question type and consensus
938
+ question_type_gen, consensus_answer = await asyncio.gather(
939
+ qtype_task, consensus_task
940
+ )
941
+
942
+ # Format outputs
943
+ consensus = f'## Consensus \n{consensus_answer.consensus}\n\n{consensus_answer.explanation}\n\n > Relevance: {consensus_answer.relevance_score:.1f}'
944
+ qn_type = format_question_type(question_type_gen)
945
+
946
+ yield formatted_df, rag_answer['answer'], consensus, qn_type, None
947
+
948
+ # Final plot
949
+ fig = await plot_task
950
+ yield formatted_df, rag_answer['answer'], consensus, qn_type, fig
951
+
952
+ except Exception as e:
953
+ print(f"Error in pathfinder: {e}")
954
+ yield None, f"Error: {str(e)}", None, None, None
955
+
956
+ async def async_retrieve(ec, query, top_k, session):
957
+ """Async wrapper for retrieval"""
958
+ loop = asyncio.get_event_loop()
959
+ return await loop.run_in_executor(None, ec.retrieve, query, top_k, True)
960
+
961
+ async def async_rag_qa(query, formatted_df, prompt_type, session):
962
+ """Async wrapper for RAG QA"""
963
+ loop = asyncio.get_event_loop()
964
+ return await loop.run_in_executor(None, run_rag_qa, query, formatted_df, prompt_type)
965
+
966
+ async def async_consensus_evaluation(query, formatted_df, session):
967
+ """Async consensus evaluation"""
968
+ abstracts = [formatted_df['abstract'][i+1] for i in range(len(formatted_df))]
969
+ loop = asyncio.get_event_loop()
970
+ return await loop.run_in_executor(None, evaluate_overall_consensus, query, abstracts)
971
+
972
+ async def async_question_type_analysis(query, session):
973
+ """Async question type analysis"""
974
+ loop = asyncio.get_event_loop()
975
+ return await loop.run_in_executor(None, guess_question_type, query)
976
+
977
+ async def async_make_plot(formatted_df, top_k):
978
+ """Async plot generation"""
979
+ loop = asyncio.get_event_loop()
980
+ return await loop.run_in_executor(None, make_embedding_plot, formatted_df, top_k, None)
981
+
982
+ def format_question_type(question_type_gen):
983
+ """Clean up question type output"""
984
+ if '<categorization>' in question_type_gen:
985
+ question_type_gen = question_type_gen.split('<categorization>')[1]
986
+ if '</categorization>' in question_type_gen:
987
+ question_type_gen = question_type_gen.split('</categorization>')[0]
988
+ return question_type_gen.replace('\n', ' \n')
989
+
990
  def create_interface():
991
  custom_css = """
992
  #custom-slider-* {
 
1025
 
1026
  inputs = [query, top_k, keywords, toggles, prompt_type, rag_type]
1027
  outputs = [ret_papers, search_results_state, qntype, conc, plot]
1028
+ btn.click(fn=run_pathfinder_optimized, inputs=inputs, outputs=outputs)
1029
 
1030
  return demo
1031