hh786 committed on
Commit
49bd135
·
1 Parent(s): c54dcef

Deployment of Hierarchical RAG system

Browse files
Files changed (2) hide show
  1. app.py +42 -23
  2. requirements.txt +20 -25
app.py CHANGED
@@ -99,7 +99,7 @@ def initialize_system():
99
 
100
 
101
  def upload_documents(
102
- files: List[str],
103
  hierarchy_choice: str,
104
  mask_pii: bool = False,
105
  progress=gr.Progress()
@@ -108,7 +108,7 @@ def upload_documents(
108
  Upload and validate documents.
109
 
110
  Args:
111
- files: List of uploaded file paths
112
  hierarchy_choice: Selected hierarchy (hospital, bank, fluid_simulation)
113
  mask_pii: Whether to mask PII
114
  progress: Gradio progress tracker
@@ -124,7 +124,13 @@ def upload_documents(
124
  invalid_files = []
125
  valid_files = []
126
 
127
- for file_path in files:
 
 
 
 
 
 
128
  ext = Path(file_path).suffix.lower()
129
  if ext in valid_extensions:
130
  valid_files.append(file_path)
@@ -158,29 +164,29 @@ def upload_documents(
158
  preview_text = "\n".join(preview_lines)
159
 
160
  if valid_files:
161
- status = f"✓ {len(valid_files)} files ready for processing."
162
  else:
163
- status = "✗ No valid files to process."
164
 
165
  return status, preview_text, stats
166
 
167
 
168
  # Update build_rag_index with better progress tracking
169
  def build_rag_index(
170
- files: List[str],
171
  hierarchy_choice: str,
172
  chunk_size: int = 512,
173
  chunk_overlap: int = 50,
174
  mask_pii: bool = False,
175
  collection_name: str = "rag_documents",
176
- use_llm_classification: bool = True, # NEW
177
  progress=gr.Progress()
178
  ) -> Tuple[str, Dict[str, Any]]:
179
  """
180
  Build RAG index from uploaded documents.
181
 
182
  Args:
183
- files: List of uploaded file paths
184
  hierarchy_choice: Selected hierarchy
185
  chunk_size: Chunk size in tokens
186
  chunk_overlap: Overlap between chunks
@@ -198,9 +204,24 @@ def build_rag_index(
198
  return "❌ No files to process.", {}
199
 
200
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  # Initialize processor
202
  progress(0.05, desc="🔧 Initializing document processor...")
203
- logger.info(f"Starting index build: {len(files)} files, hierarchy={hierarchy_choice}")
204
 
205
  processor = DocumentProcessor(
206
  hierarchy_name=hierarchy_choice,
@@ -211,14 +232,12 @@ def build_rag_index(
211
  )
212
 
213
  # Process documents
214
- progress(0.15, desc=" Processing documents...")
215
  all_chunks = []
216
 
217
- valid_files = [f for f in files if Path(f).suffix.lower() in {'.pdf', '.txt'}]
218
-
219
  for i, filepath in enumerate(valid_files):
220
  file_progress = 0.15 + (0.50 * i / len(valid_files))
221
- progress(file_progress, desc=f" Processing {Path(filepath).name}... ({i+1}/{len(valid_files)})")
222
 
223
  try:
224
  chunks = processor.process_document(filepath)
@@ -231,18 +250,18 @@ def build_rag_index(
231
  if not all_chunks:
232
  return "❌ No chunks extracted from documents. Please check your files.", {}
233
 
234
- progress(0.65, desc=f" Extracted {len(all_chunks)} chunks, building vector index...")
235
  logger.info(f"Total chunks extracted: {len(all_chunks)}")
236
 
237
  # Index documents
238
  current_hierarchy = hierarchy_choice
239
  current_collection = collection_name
240
 
241
- progress(0.75, desc=" Generating embeddings...")
242
  stats = index_manager.index_documents(all_chunks, collection_name)
243
 
244
  # Initialize RAG comparator
245
- progress(0.85, desc=" Initializing RAG pipelines...")
246
  vector_store = index_manager.get_store(collection_name)
247
 
248
  api_key = os.getenv("OPENAI_API_KEY")
@@ -257,13 +276,13 @@ def build_rag_index(
257
  progress(1.0, desc="✅ Complete!")
258
 
259
  stats_display = {
260
- " Status": "Successfully indexed",
261
- " Total Chunks": stats.get("chunks_added", 0),
262
- " Collection": collection_name,
263
- " Hierarchy": hierarchy_choice,
264
- " Embedding Model": stats.get("model_name", "Unknown"),
265
- " Embedding Dimension": stats.get("embedding_dimension", 0),
266
- " LLM Classification": "Enabled" if use_llm_classification else "Disabled"
267
  }
268
 
269
  status = f"""✅ **Successfully indexed {stats.get('chunks_added', 0)} chunks!**
 
99
 
100
 
101
  def upload_documents(
102
+ files: List[Any], # Changed from List[str]
103
  hierarchy_choice: str,
104
  mask_pii: bool = False,
105
  progress=gr.Progress()
 
108
  Upload and validate documents.
109
 
110
  Args:
111
+ files: List of uploaded file objects
112
  hierarchy_choice: Selected hierarchy (hospital, bank, fluid_simulation)
113
  mask_pii: Whether to mask PII
114
  progress: Gradio progress tracker
 
124
  invalid_files = []
125
  valid_files = []
126
 
127
+ for file_obj in files:
128
+ # Handle both file path strings and file objects
129
+ if hasattr(file_obj, 'name'):
130
+ file_path = file_obj.name
131
+ else:
132
+ file_path = str(file_obj)
133
+
134
  ext = Path(file_path).suffix.lower()
135
  if ext in valid_extensions:
136
  valid_files.append(file_path)
 
164
  preview_text = "\n".join(preview_lines)
165
 
166
  if valid_files:
167
+ status = f"✅ {len(valid_files)} files ready for processing."
168
  else:
169
+ status = "❌ No valid files to process."
170
 
171
  return status, preview_text, stats
172
 
173
 
174
  # Update build_rag_index with better progress tracking
175
  def build_rag_index(
176
+ files: List[Any], # Changed from List[str]
177
  hierarchy_choice: str,
178
  chunk_size: int = 512,
179
  chunk_overlap: int = 50,
180
  mask_pii: bool = False,
181
  collection_name: str = "rag_documents",
182
+ use_llm_classification: bool = True,
183
  progress=gr.Progress()
184
  ) -> Tuple[str, Dict[str, Any]]:
185
  """
186
  Build RAG index from uploaded documents.
187
 
188
  Args:
189
+ files: List of uploaded file objects
190
  hierarchy_choice: Selected hierarchy
191
  chunk_size: Chunk size in tokens
192
  chunk_overlap: Overlap between chunks
 
204
  return "❌ No files to process.", {}
205
 
206
  try:
207
+ # Convert file objects to paths
208
+ valid_files = []
209
+ for file_obj in files:
210
+ if hasattr(file_obj, 'name'):
211
+ file_path = file_obj.name
212
+ else:
213
+ file_path = str(file_obj)
214
+
215
+ ext = Path(file_path).suffix.lower()
216
+ if ext in {'.pdf', '.txt'}:
217
+ valid_files.append(file_path)
218
+
219
+ if not valid_files:
220
+ return "❌ No valid files to process.", {}
221
+
222
  # Initialize processor
223
  progress(0.05, desc="🔧 Initializing document processor...")
224
+ logger.info(f"Starting index build: {len(valid_files)} files, hierarchy={hierarchy_choice}")
225
 
226
  processor = DocumentProcessor(
227
  hierarchy_name=hierarchy_choice,
 
232
  )
233
 
234
  # Process documents
235
+ progress(0.15, desc="📄 Processing documents...")
236
  all_chunks = []
237
 
 
 
238
  for i, filepath in enumerate(valid_files):
239
  file_progress = 0.15 + (0.50 * i / len(valid_files))
240
+ progress(file_progress, desc=f"📖 Processing {Path(filepath).name}... ({i+1}/{len(valid_files)})")
241
 
242
  try:
243
  chunks = processor.process_document(filepath)
 
250
  if not all_chunks:
251
  return "❌ No chunks extracted from documents. Please check your files.", {}
252
 
253
+ progress(0.65, desc=f"💾 Extracted {len(all_chunks)} chunks, building vector index...")
254
  logger.info(f"Total chunks extracted: {len(all_chunks)}")
255
 
256
  # Index documents
257
  current_hierarchy = hierarchy_choice
258
  current_collection = collection_name
259
 
260
+ progress(0.75, desc="🔍 Generating embeddings...")
261
  stats = index_manager.index_documents(all_chunks, collection_name)
262
 
263
  # Initialize RAG comparator
264
+ progress(0.85, desc="🤖 Initializing RAG pipelines...")
265
  vector_store = index_manager.get_store(collection_name)
266
 
267
  api_key = os.getenv("OPENAI_API_KEY")
 
276
  progress(1.0, desc="✅ Complete!")
277
 
278
  stats_display = {
279
+ "✅ Status": "Successfully indexed",
280
+ "📦 Total Chunks": stats.get("chunks_added", 0),
281
+ "🗂️ Collection": collection_name,
282
+ "🏷️ Hierarchy": hierarchy_choice,
283
+ "🧠 Embedding Model": stats.get("model_name", "Unknown"),
284
+ "📊 Embedding Dimension": stats.get("embedding_dimension", 0),
285
+ "🤖 LLM Classification": "Enabled" if use_llm_classification else "Disabled"
286
  }
287
 
288
  status = f"""✅ **Successfully indexed {stats.get('chunks_added', 0)} chunks!**
requirements.txt CHANGED
@@ -1,40 +1,35 @@
1
- # Core
2
- gradio>=4.44.0
3
- gradio_client>=0.18.0
4
- python-dotenv>=1.0.0
5
 
6
  # Document Processing
7
- PyPDF2>=3.0.0
8
- pyyaml>=6.0.1
9
 
10
  # Vector Database
11
- chromadb>=0.4.22
12
 
13
  # Embeddings & NLP
14
- torch==2.1.0
15
- transformers==4.35.0
16
- sentence-transformers==2.2.2
17
 
18
  # OpenAI
19
- openai>=1.0.0
20
 
21
- # Data Processing & Visualization
22
- pandas>=2.0.0
23
- numpy>=1.24.0
24
- matplotlib>=3.7.0
25
- seaborn>=0.12.0
26
 
27
- # Error Handling & Retry Logic
28
- tenacity>=8.2.0
 
29
 
30
- # Testing
31
- pytest>=7.4.0
32
- pytest-cov>=4.1.0
33
 
34
  # MCP Server
35
- fastapi>=0.104.0
36
- uvicorn>=0.24.0
37
- pydantic>=2.0.0
38
 
39
  # Utilities
40
- tiktoken>=0.5.0
 
1
+ # Core - Minimal for HF Spaces
2
+ gradio==4.44.0
3
+ python-dotenv
 
4
 
5
  # Document Processing
6
+ PyPDF2
7
+ pyyaml
8
 
9
  # Vector Database
10
+ chromadb
11
 
12
  # Embeddings & NLP
13
+ sentence-transformers
 
 
14
 
15
  # OpenAI
16
+ openai
17
 
18
+ # Data Processing
19
+ pandas
20
+ numpy<2.0.0
 
 
21
 
22
+ # Visualization (optional)
23
+ matplotlib
24
+ seaborn
25
 
26
+ # Error Handling
27
+ tenacity
 
28
 
29
  # MCP Server
30
+ fastapi
31
+ uvicorn
32
+ pydantic
33
 
34
  # Utilities
35
+ tiktoken