Spaces:

Ariyan-Pro
/

rag-latency-optimization

Sleeping

App Files Files Community

Ariyan-Pro committed on Mar 31

Commit

eed2a86

verified ·

1 Parent(s): a491868

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -32

app.py CHANGED Viewed

@@ -3,46 +3,81 @@ import time
 import sys
 import os
-# Add the repo root to path so we can import the /app modules
 sys.path.append(os.path.dirname(__file__))
-# Import your three RAG implementations
-from app.rag_naive import NaiveRAG
-from app.rag_optimized import OptimizedRAG
-from app.no_compromise_rag import NoCompromiseRAG
-# -------------------------------------------------------------------
-# Initialize the three RAG systems once at startup.
-# If memory becomes an issue, we can lazy‑load them inside each function.
-# -------------------------------------------------------------------
-print("Initializing Naive RAG...")
-naive_rag = NaiveRAG()           # loads embedding model + FAISS index
-print("Initializing Optimized RAG...")
-optimized_rag = OptimizedRAG()   # loads the same + SQLite cache
-print("Initializing No‑Compromise RAG...")
-no_compromise_rag = NoCompromiseRAG()
-print("All RAG systems ready.")
-# -------------------------------------------------------------------
-# Define the query functions for each mode
-# -------------------------------------------------------------------
 def query_naive(question):
-    start = time.perf_counter()
-    answer, chunks_used, cache_hit = naive_rag.query(question)
-    latency = (time.perf_counter() - start) * 1000
-    return answer, f"{latency:.1f} ms", chunks_used, "Yes" if cache_hit else "No"
 def query_optimized(question):
-    start = time.perf_counter()
-    answer, chunks_used, cache_hit = optimized_rag.query(question)
-    latency = (time.perf_counter() - start) * 1000
-    return answer, f"{latency:.1f} ms", chunks_used, "Yes" if cache_hit else "No"
 def query_no_compromise(question):
-    start = time.perf_counter()
-    answer, chunks_used, cache_hit = no_compromise_rag.query(question)
-    latency = (time.perf_counter() - start) * 1000
-    return answer, f"{latency:.1f} ms", chunks_used, "Yes" if cache_hit else "No"
 # -------------------------------------------------------------------
 # Build the Gradio interface
@@ -112,6 +147,5 @@ with gr.Blocks(title="RAG Latency Optimization", theme=gr.themes.Soft()) as demo
     **Caching**: SQLite (Optimized) + LRU memory | **Generation**: Simulated (real LLM can be plugged in)
     """)
-# Launch the app
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

 import sys
 import os
+# Add repo root to path
 sys.path.append(os.path.dirname(__file__))
+# Global references to loaded systems
+_naive_rag = None
+_optimized_rag = None
+_no_compromise_rag = None
+_embedding_model = None   # shared model
+def get_embedding_model():
+    """Load the embedding model once and reuse it across all RAG classes."""
+    global _embedding_model
+    if _embedding_model is None:
+        from sentence_transformers import SentenceTransformer
+        print("Loading embedding model...")
+        _embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+    return _embedding_model
+def get_naive():
+    global _naive_rag
+    if _naive_rag is None:
+        from app.rag_naive import NaiveRAG
+        print("Initializing Naive RAG...")
+        # TODO: pass the shared embedding model from get_embedding_model() here.
+        # This requires NaiveRAG to accept a model argument; until then it is
+        # constructed without one.
+        _naive_rag = NaiveRAG()
+        # If NaiveRAG has a set_embedding_model method, call it:
+        # _naive_rag.set_embedding_model(get_embedding_model())
+    return _naive_rag
+def get_optimized():
+    global _optimized_rag
+    if _optimized_rag is None:
+        from app.rag_optimized import OptimizedRAG
+        print("Initializing Optimized RAG...")
+        _optimized_rag = OptimizedRAG()
+    return _optimized_rag
+def get_no_compromise():
+    global _no_compromise_rag
+    if _no_compromise_rag is None:
+        from app.no_compromise_rag import NoCompromiseRAG
+        print("Initializing No-Compromise RAG...")
+        _no_compromise_rag = NoCompromiseRAG()
+    return _no_compromise_rag
 def query_naive(question):
+    try:
+        rag = get_naive()
+        start = time.perf_counter()
+        answer, chunks_used, cache_hit = rag.query(question)
+        latency = (time.perf_counter() - start) * 1000
+        return answer, f"{latency:.1f} ms", str(chunks_used), "Yes" if cache_hit else "No"
+    except Exception as e:
+        return f"Error: {e}", "0 ms", "0", "No"
 def query_optimized(question):
+    try:
+        rag = get_optimized()
+        start = time.perf_counter()
+        answer, chunks_used, cache_hit = rag.query(question)
+        latency = (time.perf_counter() - start) * 1000
+        return answer, f"{latency:.1f} ms", str(chunks_used), "Yes" if cache_hit else "No"
+    except Exception as e:
+        return f"Error: {e}", "0 ms", "0", "No"
 def query_no_compromise(question):
+    try:
+        rag = get_no_compromise()
+        start = time.perf_counter()
+        answer, chunks_used, cache_hit = rag.query(question)
+        latency = (time.perf_counter() - start) * 1000
+        return answer, f"{latency:.1f} ms", str(chunks_used), "Yes" if cache_hit else "No"
+    except Exception as e:
+        return f"Error: {e}", "0 ms", "0", "No"
 # -------------------------------------------------------------------
 # Build the Gradio interface
     **Caching**: SQLite (Optimized) + LRU memory | **Generation**: Simulated (real LLM can be plugged in)
     """)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)