Spaces:
Sleeping
Sleeping
Deployment of Hierarchical RAG system
Browse files- app.py +42 -23
- requirements.txt +20 -25
app.py
CHANGED
|
@@ -99,7 +99,7 @@ def initialize_system():
|
|
| 99 |
|
| 100 |
|
| 101 |
def upload_documents(
|
| 102 |
-
files: List[
|
| 103 |
hierarchy_choice: str,
|
| 104 |
mask_pii: bool = False,
|
| 105 |
progress=gr.Progress()
|
|
@@ -108,7 +108,7 @@ def upload_documents(
|
|
| 108 |
Upload and validate documents.
|
| 109 |
|
| 110 |
Args:
|
| 111 |
-
files: List of uploaded file
|
| 112 |
hierarchy_choice: Selected hierarchy (hospital, bank, fluid_simulation)
|
| 113 |
mask_pii: Whether to mask PII
|
| 114 |
progress: Gradio progress tracker
|
|
@@ -124,7 +124,13 @@ def upload_documents(
|
|
| 124 |
invalid_files = []
|
| 125 |
valid_files = []
|
| 126 |
|
| 127 |
-
for
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
ext = Path(file_path).suffix.lower()
|
| 129 |
if ext in valid_extensions:
|
| 130 |
valid_files.append(file_path)
|
|
@@ -158,29 +164,29 @@ def upload_documents(
|
|
| 158 |
preview_text = "\n".join(preview_lines)
|
| 159 |
|
| 160 |
if valid_files:
|
| 161 |
-
status = f"
|
| 162 |
else:
|
| 163 |
-
status = "
|
| 164 |
|
| 165 |
return status, preview_text, stats
|
| 166 |
|
| 167 |
|
| 168 |
# Update build_rag_index with better progress tracking
|
| 169 |
def build_rag_index(
|
| 170 |
-
files: List[
|
| 171 |
hierarchy_choice: str,
|
| 172 |
chunk_size: int = 512,
|
| 173 |
chunk_overlap: int = 50,
|
| 174 |
mask_pii: bool = False,
|
| 175 |
collection_name: str = "rag_documents",
|
| 176 |
-
use_llm_classification: bool = True,
|
| 177 |
progress=gr.Progress()
|
| 178 |
) -> Tuple[str, Dict[str, Any]]:
|
| 179 |
"""
|
| 180 |
Build RAG index from uploaded documents.
|
| 181 |
|
| 182 |
Args:
|
| 183 |
-
files: List of uploaded file
|
| 184 |
hierarchy_choice: Selected hierarchy
|
| 185 |
chunk_size: Chunk size in tokens
|
| 186 |
chunk_overlap: Overlap between chunks
|
|
@@ -198,9 +204,24 @@ def build_rag_index(
|
|
| 198 |
return "❌ No files to process.", {}
|
| 199 |
|
| 200 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
# Initialize processor
|
| 202 |
progress(0.05, desc="π§ Initializing document processor...")
|
| 203 |
-
logger.info(f"Starting index build: {len(
|
| 204 |
|
| 205 |
processor = DocumentProcessor(
|
| 206 |
hierarchy_name=hierarchy_choice,
|
|
@@ -211,14 +232,12 @@ def build_rag_index(
|
|
| 211 |
)
|
| 212 |
|
| 213 |
# Process documents
|
| 214 |
-
progress(0.15, desc=" Processing documents...")
|
| 215 |
all_chunks = []
|
| 216 |
|
| 217 |
-
valid_files = [f for f in files if Path(f).suffix.lower() in {'.pdf', '.txt'}]
|
| 218 |
-
|
| 219 |
for i, filepath in enumerate(valid_files):
|
| 220 |
file_progress = 0.15 + (0.50 * i / len(valid_files))
|
| 221 |
-
progress(file_progress, desc=f" Processing {Path(filepath).name}... ({i+1}/{len(valid_files)})")
|
| 222 |
|
| 223 |
try:
|
| 224 |
chunks = processor.process_document(filepath)
|
|
@@ -231,18 +250,18 @@ def build_rag_index(
|
|
| 231 |
if not all_chunks:
|
| 232 |
return "❌ No chunks extracted from documents. Please check your files.", {}
|
| 233 |
|
| 234 |
-
progress(0.65, desc=f" Extracted {len(all_chunks)} chunks, building vector index...")
|
| 235 |
logger.info(f"Total chunks extracted: {len(all_chunks)}")
|
| 236 |
|
| 237 |
# Index documents
|
| 238 |
current_hierarchy = hierarchy_choice
|
| 239 |
current_collection = collection_name
|
| 240 |
|
| 241 |
-
progress(0.75, desc=" Generating embeddings...")
|
| 242 |
stats = index_manager.index_documents(all_chunks, collection_name)
|
| 243 |
|
| 244 |
# Initialize RAG comparator
|
| 245 |
-
progress(0.85, desc=" Initializing RAG pipelines...")
|
| 246 |
vector_store = index_manager.get_store(collection_name)
|
| 247 |
|
| 248 |
api_key = os.getenv("OPENAI_API_KEY")
|
|
@@ -257,13 +276,13 @@ def build_rag_index(
|
|
| 257 |
progress(1.0, desc="✅ Complete!")
|
| 258 |
|
| 259 |
stats_display = {
|
| 260 |
-
" Status": "Successfully indexed",
|
| 261 |
-
" Total Chunks": stats.get("chunks_added", 0),
|
| 262 |
-
" Collection": collection_name,
|
| 263 |
-
" Hierarchy": hierarchy_choice,
|
| 264 |
-
" Embedding Model": stats.get("model_name", "Unknown"),
|
| 265 |
-
" Embedding Dimension": stats.get("embedding_dimension", 0),
|
| 266 |
-
" LLM Classification": "Enabled" if use_llm_classification else "Disabled"
|
| 267 |
}
|
| 268 |
|
| 269 |
status = f"""✅ **Successfully indexed {stats.get('chunks_added', 0)} chunks!**
|
|
|
|
| 99 |
|
| 100 |
|
| 101 |
def upload_documents(
|
| 102 |
+
files: List[Any], # Changed from List[str]
|
| 103 |
hierarchy_choice: str,
|
| 104 |
mask_pii: bool = False,
|
| 105 |
progress=gr.Progress()
|
|
|
|
| 108 |
Upload and validate documents.
|
| 109 |
|
| 110 |
Args:
|
| 111 |
+
files: List of uploaded file objects
|
| 112 |
hierarchy_choice: Selected hierarchy (hospital, bank, fluid_simulation)
|
| 113 |
mask_pii: Whether to mask PII
|
| 114 |
progress: Gradio progress tracker
|
|
|
|
| 124 |
invalid_files = []
|
| 125 |
valid_files = []
|
| 126 |
|
| 127 |
+
for file_obj in files:
|
| 128 |
+
# Handle both file path strings and file objects
|
| 129 |
+
if hasattr(file_obj, 'name'):
|
| 130 |
+
file_path = file_obj.name
|
| 131 |
+
else:
|
| 132 |
+
file_path = str(file_obj)
|
| 133 |
+
|
| 134 |
ext = Path(file_path).suffix.lower()
|
| 135 |
if ext in valid_extensions:
|
| 136 |
valid_files.append(file_path)
|
|
|
|
| 164 |
preview_text = "\n".join(preview_lines)
|
| 165 |
|
| 166 |
if valid_files:
|
| 167 |
+
status = f"✅ {len(valid_files)} files ready for processing."
|
| 168 |
else:
|
| 169 |
+
status = "❌ No valid files to process."
|
| 170 |
|
| 171 |
return status, preview_text, stats
|
| 172 |
|
| 173 |
|
| 174 |
# Update build_rag_index with better progress tracking
|
| 175 |
def build_rag_index(
|
| 176 |
+
files: List[Any], # Changed from List[str]
|
| 177 |
hierarchy_choice: str,
|
| 178 |
chunk_size: int = 512,
|
| 179 |
chunk_overlap: int = 50,
|
| 180 |
mask_pii: bool = False,
|
| 181 |
collection_name: str = "rag_documents",
|
| 182 |
+
use_llm_classification: bool = True,
|
| 183 |
progress=gr.Progress()
|
| 184 |
) -> Tuple[str, Dict[str, Any]]:
|
| 185 |
"""
|
| 186 |
Build RAG index from uploaded documents.
|
| 187 |
|
| 188 |
Args:
|
| 189 |
+
files: List of uploaded file objects
|
| 190 |
hierarchy_choice: Selected hierarchy
|
| 191 |
chunk_size: Chunk size in tokens
|
| 192 |
chunk_overlap: Overlap between chunks
|
|
|
|
| 204 |
return "❌ No files to process.", {}
|
| 205 |
|
| 206 |
try:
|
| 207 |
+
# Convert file objects to paths
|
| 208 |
+
valid_files = []
|
| 209 |
+
for file_obj in files:
|
| 210 |
+
if hasattr(file_obj, 'name'):
|
| 211 |
+
file_path = file_obj.name
|
| 212 |
+
else:
|
| 213 |
+
file_path = str(file_obj)
|
| 214 |
+
|
| 215 |
+
ext = Path(file_path).suffix.lower()
|
| 216 |
+
if ext in {'.pdf', '.txt'}:
|
| 217 |
+
valid_files.append(file_path)
|
| 218 |
+
|
| 219 |
+
if not valid_files:
|
| 220 |
+
return "❌ No valid files to process.", {}
|
| 221 |
+
|
| 222 |
# Initialize processor
|
| 223 |
progress(0.05, desc="π§ Initializing document processor...")
|
| 224 |
+
logger.info(f"Starting index build: {len(valid_files)} files, hierarchy={hierarchy_choice}")
|
| 225 |
|
| 226 |
processor = DocumentProcessor(
|
| 227 |
hierarchy_name=hierarchy_choice,
|
|
|
|
| 232 |
)
|
| 233 |
|
| 234 |
# Process documents
|
| 235 |
+
progress(0.15, desc="π Processing documents...")
|
| 236 |
all_chunks = []
|
| 237 |
|
|
|
|
|
|
|
| 238 |
for i, filepath in enumerate(valid_files):
|
| 239 |
file_progress = 0.15 + (0.50 * i / len(valid_files))
|
| 240 |
+
progress(file_progress, desc=f"π Processing {Path(filepath).name}... ({i+1}/{len(valid_files)})")
|
| 241 |
|
| 242 |
try:
|
| 243 |
chunks = processor.process_document(filepath)
|
|
|
|
| 250 |
if not all_chunks:
|
| 251 |
return "❌ No chunks extracted from documents. Please check your files.", {}
|
| 252 |
|
| 253 |
+
progress(0.65, desc=f"💾 Extracted {len(all_chunks)} chunks, building vector index...")
|
| 254 |
logger.info(f"Total chunks extracted: {len(all_chunks)}")
|
| 255 |
|
| 256 |
# Index documents
|
| 257 |
current_hierarchy = hierarchy_choice
|
| 258 |
current_collection = collection_name
|
| 259 |
|
| 260 |
+
progress(0.75, desc="π Generating embeddings...")
|
| 261 |
stats = index_manager.index_documents(all_chunks, collection_name)
|
| 262 |
|
| 263 |
# Initialize RAG comparator
|
| 264 |
+
progress(0.85, desc="🤖 Initializing RAG pipelines...")
|
| 265 |
vector_store = index_manager.get_store(collection_name)
|
| 266 |
|
| 267 |
api_key = os.getenv("OPENAI_API_KEY")
|
|
|
|
| 276 |
progress(1.0, desc="✅ Complete!")
|
| 277 |
|
| 278 |
stats_display = {
|
| 279 |
+
"✅ Status": "Successfully indexed",
|
| 280 |
+
"📦 Total Chunks": stats.get("chunks_added", 0),
|
| 281 |
+
"ποΈ Collection": collection_name,
|
| 282 |
+
"🏷️ Hierarchy": hierarchy_choice,
|
| 283 |
+
"π§ Embedding Model": stats.get("model_name", "Unknown"),
|
| 284 |
+
"π Embedding Dimension": stats.get("embedding_dimension", 0),
|
| 285 |
+
"🤖 LLM Classification": "Enabled" if use_llm_classification else "Disabled"
|
| 286 |
}
|
| 287 |
|
| 288 |
status = f"""✅ **Successfully indexed {stats.get('chunks_added', 0)} chunks!**
|
requirements.txt
CHANGED
|
@@ -1,40 +1,35 @@
|
|
| 1 |
-
# Core
|
| 2 |
-
gradio
|
| 3 |
-
|
| 4 |
-
python-dotenv>=1.0.0
|
| 5 |
|
| 6 |
# Document Processing
|
| 7 |
-
PyPDF2
|
| 8 |
-
pyyaml
|
| 9 |
|
| 10 |
# Vector Database
|
| 11 |
-
chromadb
|
| 12 |
|
| 13 |
# Embeddings & NLP
|
| 14 |
-
|
| 15 |
-
transformers==4.35.0
|
| 16 |
-
sentence-transformers==2.2.2
|
| 17 |
|
| 18 |
# OpenAI
|
| 19 |
-
openai
|
| 20 |
|
| 21 |
-
# Data Processing
|
| 22 |
-
pandas
|
| 23 |
-
numpy
|
| 24 |
-
matplotlib>=3.7.0
|
| 25 |
-
seaborn>=0.12.0
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
|
|
|
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
|
| 32 |
-
pytest-cov>=4.1.0
|
| 33 |
|
| 34 |
# MCP Server
|
| 35 |
-
fastapi
|
| 36 |
-
uvicorn
|
| 37 |
-
pydantic
|
| 38 |
|
| 39 |
# Utilities
|
| 40 |
-
tiktoken
|
|
|
|
| 1 |
+
# Core - Minimal for HF Spaces
|
| 2 |
+
gradio==4.44.0
|
| 3 |
+
python-dotenv
|
|
|
|
| 4 |
|
| 5 |
# Document Processing
|
| 6 |
+
PyPDF2
|
| 7 |
+
pyyaml
|
| 8 |
|
| 9 |
# Vector Database
|
| 10 |
+
chromadb
|
| 11 |
|
| 12 |
# Embeddings & NLP
|
| 13 |
+
sentence-transformers
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# OpenAI
|
| 16 |
+
openai
|
| 17 |
|
| 18 |
+
# Data Processing
|
| 19 |
+
pandas
|
| 20 |
+
numpy<2.0.0
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
# Visualization (optional)
|
| 23 |
+
matplotlib
|
| 24 |
+
seaborn
|
| 25 |
|
| 26 |
+
# Error Handling
|
| 27 |
+
tenacity
|
|
|
|
| 28 |
|
| 29 |
# MCP Server
|
| 30 |
+
fastapi
|
| 31 |
+
uvicorn
|
| 32 |
+
pydantic
|
| 33 |
|
| 34 |
# Utilities
|
| 35 |
+
tiktoken
|