Spaces:

shahbazdev0
/

hierarchical-rag-eval

Sleeping

App Files Community

hh786 committed on Nov 6, 2025

Commit

49bd135

1 Parent(s): c54dcef

Deployment of Hierarchical RAG system

Browse files

Files changed (2) hide show

app.py +42 -23
requirements.txt +20 -25

app.py CHANGED Viewed

@@ -99,7 +99,7 @@ def initialize_system():
 def upload_documents(
-    files: List[str],
     hierarchy_choice: str,
     mask_pii: bool = False,
     progress=gr.Progress()
@@ -108,7 +108,7 @@ def upload_documents(
     Upload and validate documents.
     Args:
-        files: List of uploaded file paths
         hierarchy_choice: Selected hierarchy (hospital, bank, fluid_simulation)
         mask_pii: Whether to mask PII
         progress: Gradio progress tracker
@@ -124,7 +124,13 @@ def upload_documents(
     invalid_files = []
     valid_files = []
-    for file_path in files:
         ext = Path(file_path).suffix.lower()
         if ext in valid_extensions:
             valid_files.append(file_path)
@@ -158,29 +164,29 @@ def upload_documents(
     preview_text = "\n".join(preview_lines)
     if valid_files:
-        status = f"✓ {len(valid_files)} files ready for processing."
     else:
-        status = "✗ No valid files to process."
     return status, preview_text, stats
 # Update build_rag_index with better progress tracking
 def build_rag_index(
-    files: List[str],
     hierarchy_choice: str,
     chunk_size: int = 512,
     chunk_overlap: int = 50,
     mask_pii: bool = False,
     collection_name: str = "rag_documents",
-    use_llm_classification: bool = True,  # NEW
     progress=gr.Progress()
 ) -> Tuple[str, Dict[str, Any]]:
     """
     Build RAG index from uploaded documents.
     Args:
-        files: List of uploaded file paths
         hierarchy_choice: Selected hierarchy
         chunk_size: Chunk size in tokens
         chunk_overlap: Overlap between chunks
@@ -198,9 +204,24 @@ def build_rag_index(
         return "❌ No files to process.", {}
     try:
         # Initialize processor
         progress(0.05, desc="🔧 Initializing document processor...")
-        logger.info(f"Starting index build: {len(files)} files, hierarchy={hierarchy_choice}")
         processor = DocumentProcessor(
             hierarchy_name=hierarchy_choice,
@@ -211,14 +232,12 @@ def build_rag_index(
         )
         # Process documents
-        progress(0.15, desc=" Processing documents...")
         all_chunks = []
-        valid_files = [f for f in files if Path(f).suffix.lower() in {'.pdf', '.txt'}]
         for i, filepath in enumerate(valid_files):
             file_progress = 0.15 + (0.50 * i / len(valid_files))
-            progress(file_progress, desc=f" Processing {Path(filepath).name}... ({i+1}/{len(valid_files)})")
             try:
                 chunks = processor.process_document(filepath)
@@ -231,18 +250,18 @@ def build_rag_index(
         if not all_chunks:
             return "❌ No chunks extracted from documents. Please check your files.", {}
-        progress(0.65, desc=f" Extracted {len(all_chunks)} chunks, building vector index...")
         logger.info(f"Total chunks extracted: {len(all_chunks)}")
         # Index documents
         current_hierarchy = hierarchy_choice
         current_collection = collection_name
-        progress(0.75, desc=" Generating embeddings...")
         stats = index_manager.index_documents(all_chunks, collection_name)
         # Initialize RAG comparator
-        progress(0.85, desc=" Initializing RAG pipelines...")
         vector_store = index_manager.get_store(collection_name)
         api_key = os.getenv("OPENAI_API_KEY")
@@ -257,13 +276,13 @@ def build_rag_index(
         progress(1.0, desc="✅ Complete!")
         stats_display = {
-            " Status": "Successfully indexed",
-            " Total Chunks": stats.get("chunks_added", 0),
-            " Collection": collection_name,
-            " Hierarchy": hierarchy_choice,
-            " Embedding Model": stats.get("model_name", "Unknown"),
-            " Embedding Dimension": stats.get("embedding_dimension", 0),
-            " LLM Classification": "Enabled" if use_llm_classification else "Disabled"
         }
         status = f"""✅ **Successfully indexed {stats.get('chunks_added', 0)} chunks!**

 def upload_documents(
+    files: List[Any],  # Changed from List[str]
     hierarchy_choice: str,
     mask_pii: bool = False,
     progress=gr.Progress()
     Upload and validate documents.
     Args:
+        files: List of uploaded file objects
         hierarchy_choice: Selected hierarchy (hospital, bank, fluid_simulation)
         mask_pii: Whether to mask PII
         progress: Gradio progress tracker
     invalid_files = []
     valid_files = []
+    for file_obj in files:
+        # Handle both file path strings and file objects
+        if hasattr(file_obj, 'name'):
+            file_path = file_obj.name
+        else:
+            file_path = str(file_obj)
         ext = Path(file_path).suffix.lower()
         if ext in valid_extensions:
             valid_files.append(file_path)
     preview_text = "\n".join(preview_lines)
     if valid_files:
+        status = f"✅ {len(valid_files)} files ready for processing."
     else:
+        status = "❌ No valid files to process."
     return status, preview_text, stats
 # Update build_rag_index with better progress tracking
 def build_rag_index(
+    files: List[Any],  # Changed from List[str]
     hierarchy_choice: str,
     chunk_size: int = 512,
     chunk_overlap: int = 50,
     mask_pii: bool = False,
     collection_name: str = "rag_documents",
+    use_llm_classification: bool = True,
     progress=gr.Progress()
 ) -> Tuple[str, Dict[str, Any]]:
     """
     Build RAG index from uploaded documents.
     Args:
+        files: List of uploaded file objects
         hierarchy_choice: Selected hierarchy
         chunk_size: Chunk size in tokens
         chunk_overlap: Overlap between chunks
         return "❌ No files to process.", {}
     try:
+        # Convert file objects to paths
+        valid_files = []
+        for file_obj in files:
+            if hasattr(file_obj, 'name'):
+                file_path = file_obj.name
+            else:
+                file_path = str(file_obj)
+            ext = Path(file_path).suffix.lower()
+            if ext in {'.pdf', '.txt'}:
+                valid_files.append(file_path)
+        if not valid_files:
+            return "❌ No valid files to process.", {}
         # Initialize processor
         progress(0.05, desc="🔧 Initializing document processor...")
+        logger.info(f"Starting index build: {len(valid_files)} files, hierarchy={hierarchy_choice}")
         processor = DocumentProcessor(
             hierarchy_name=hierarchy_choice,
         )
         # Process documents
+        progress(0.15, desc="📄 Processing documents...")
         all_chunks = []
         for i, filepath in enumerate(valid_files):
             file_progress = 0.15 + (0.50 * i / len(valid_files))
+            progress(file_progress, desc=f"📖 Processing {Path(filepath).name}... ({i+1}/{len(valid_files)})")
             try:
                 chunks = processor.process_document(filepath)
         if not all_chunks:
             return "❌ No chunks extracted from documents. Please check your files.", {}
+        progress(0.65, desc=f"💾 Extracted {len(all_chunks)} chunks, building vector index...")
         logger.info(f"Total chunks extracted: {len(all_chunks)}")
         # Index documents
         current_hierarchy = hierarchy_choice
         current_collection = collection_name
+        progress(0.75, desc="🔍 Generating embeddings...")
         stats = index_manager.index_documents(all_chunks, collection_name)
         # Initialize RAG comparator
+        progress(0.85, desc="🤖 Initializing RAG pipelines...")
         vector_store = index_manager.get_store(collection_name)
         api_key = os.getenv("OPENAI_API_KEY")
         progress(1.0, desc="✅ Complete!")
         stats_display = {
+            "✅ Status": "Successfully indexed",
+            "📦 Total Chunks": stats.get("chunks_added", 0),
+            "🗂️ Collection": collection_name,
+            "🏷️ Hierarchy": hierarchy_choice,
+            "🧠 Embedding Model": stats.get("model_name", "Unknown"),
+            "📊 Embedding Dimension": stats.get("embedding_dimension", 0),
+            "🤖 LLM Classification": "Enabled" if use_llm_classification else "Disabled"
         }
         status = f"""✅ **Successfully indexed {stats.get('chunks_added', 0)} chunks!**

requirements.txt CHANGED Viewed

@@ -1,40 +1,35 @@
-# Core
-gradio>=4.44.0
-gradio_client>=0.18.0
-python-dotenv>=1.0.0
 # Document Processing
-PyPDF2>=3.0.0
-pyyaml>=6.0.1
 # Vector Database
-chromadb>=0.4.22
 # Embeddings & NLP
-torch==2.1.0
-transformers==4.35.0
-sentence-transformers==2.2.2
 # OpenAI
-openai>=1.0.0
-# Data Processing & Visualization
-pandas>=2.0.0
-numpy>=1.24.0
-matplotlib>=3.7.0
-seaborn>=0.12.0
-# Error Handling & Retry Logic
-tenacity>=8.2.0
-# Testing
-pytest>=7.4.0
-pytest-cov>=4.1.0
 # MCP Server
-fastapi>=0.104.0
-uvicorn>=0.24.0
-pydantic>=2.0.0
 # Utilities
-tiktoken>=0.5.0

+# Core - Minimal for HF Spaces
+gradio==4.44.0
+python-dotenv
 # Document Processing
+PyPDF2
+pyyaml
 # Vector Database
+chromadb
 # Embeddings & NLP
+sentence-transformers
 # OpenAI
+openai
+# Data Processing
+pandas
+numpy<2.0.0
+# Visualization (optional)
+matplotlib
+seaborn
+# Error Handling
+tenacity
 # MCP Server
+fastapi
+uvicorn
+pydantic
 # Utilities
+tiktoken