TilanB committed on
Commit
c23a6c5
·
verified ·
1 Parent(s): 8c63c58

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +127 -70
main.py CHANGED
@@ -19,7 +19,7 @@ from content_analyzer.document_parser import DocumentProcessor
19
  from search_engine.indexer import RetrieverBuilder
20
  from intelligence.orchestrator import AgentWorkflow
21
  from configuration import definitions, parameters
22
-
23
 
24
  # Rate limiting configuration - 3 requests per hour per IP
25
  WINDOW_S = 3600
@@ -31,17 +31,17 @@ def rate_limit(request):
31
  """Thread-safe rate limiting per IP address."""
32
  ip = getattr(request.client, "host", "unknown")
33
  now = time.time()
34
-
35
  with _calls_lock:
36
  q = _calls[ip]
37
  # Remove expired entries
38
  while q and (now - q[0]) > WINDOW_S:
39
  q.popleft()
40
-
41
  if len(q) >= MAX_CALLS:
42
  import gradio as gr
43
  raise gr.Error(f"Rate limit: {MAX_CALLS} requests per {WINDOW_S//60} minutes. Please wait.")
44
-
45
  q.append(now)
46
 
47
 
@@ -66,14 +66,14 @@ def format_chat_history(history: List[Dict]) -> str:
66
  """Format chat history as markdown for display."""
67
  if not history:
68
  return "*No conversation history yet. Ask a question to get started!*"
69
-
70
  formatted = []
71
  for i, entry in enumerate(history, 1):
72
  timestamp = entry.get("timestamp", "")
73
  question = entry.get("question", "")
74
  answer = entry.get("answer", "")
75
  confidence = entry.get("confidence", "N/A")
76
-
77
  formatted.append(f"""
78
  ---
79
  ### 💬 Q{i} ({timestamp})
@@ -83,7 +83,7 @@ def format_chat_history(history: List[Dict]) -> str:
83
 
84
  *Confidence: {confidence}*
85
  """)
86
-
87
  return "\n".join(formatted)
88
 
89
 
@@ -91,19 +91,19 @@ def format_document_context(documents: List, question: str = "") -> str:
91
  """Format retrieved documents with annotation highlighting."""
92
  if not documents:
93
  return "*No documents retrieved yet.*"
94
-
95
  formatted = [f"### 📚 Retrieved Context ({len(documents)} chunks)\n"]
96
-
97
  # Extract key terms from question for highlighting
98
  key_terms = []
99
  if question:
100
  stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or', 'what', 'how', 'why', 'when', 'where', 'which'}
101
  key_terms = [word.lower() for word in question.split() if word.lower() not in stopwords and len(word) > 2]
102
-
103
  for i, doc in enumerate(documents[:5], 1):
104
  content = doc.page_content if hasattr(doc, 'page_content') else str(doc)
105
  source = doc.metadata.get('source', 'Unknown') if hasattr(doc, 'metadata') else 'Unknown'
106
-
107
  # Truncate long content
108
  if len(content) > 500:
109
  content = content[:500] + "..."
@@ -114,7 +114,7 @@ def format_document_context(documents: List, question: str = "") -> str:
114
  import re
115
  pattern = re.compile(re.escape(term), re.IGNORECASE)
116
  highlighted_content = pattern.sub(f"**{term}**", highlighted_content)
117
-
118
  formatted.append(f"""
119
  <details>
120
  <summary>📄 Chunk {i} - {os.path.basename(source)}</summary>
@@ -123,10 +123,10 @@ def format_document_context(documents: List, question: str = "") -> str:
123
 
124
  </details>
125
  """)
126
-
127
  if len(documents) > 5:
128
  formatted.append(f"\n*... and {len(documents) - 5} more chunks*")
129
-
130
  return "\n".join(formatted)
131
 
132
 
@@ -186,16 +186,16 @@ def main():
186
  _ensure_hfhub_hffolder_compat() # must run before importing gradio
187
  import gradio as gr
188
  _setup_gradio_shim()
189
-
190
  logger.info("=" * 60)
191
  logger.info("Starting SmartDoc AI application...")
192
  logger.info("=" * 60)
193
-
194
  # Initialize components
195
  processor = DocumentProcessor()
196
  retriever_indexer = RetrieverBuilder()
197
  orchestrator = AgentWorkflow()
198
-
199
  logger.info("All components initialized successfully")
200
 
201
  # CSS styling - Clean, accessible light theme with professional colors
@@ -205,7 +205,7 @@ def main():
205
  background: linear-gradient(180deg, #f8fafc 0%, #e2e8f0 100%) !important;
206
  font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
207
  }
208
-
209
  /* Title styles - Dark text for readability */
210
  .app-title {
211
  font-size: 2.2em !important;
@@ -226,7 +226,7 @@ def main():
226
  font-size: 0.95em !important;
227
  line-height: 1.6 !important;
228
  }
229
-
230
  /* Section headers */
231
  .section-header {
232
  color: #1e293b !important;
@@ -235,7 +235,7 @@ def main():
235
  padding-bottom: 8px !important;
236
  margin-bottom: 16px !important;
237
  }
238
-
239
  /* Chat history panel - Clean white card with more height */
240
  .chat-history {
241
  min-height: 500px;
@@ -259,7 +259,7 @@ def main():
259
  .chat-history strong {
260
  color: #1e293b !important;
261
  }
262
-
263
  /* Document context panel */
264
  .doc-context {
265
  max-height: 380px;
@@ -286,7 +286,7 @@ def main():
286
  .doc-context p, .doc-context span {
287
  color: #475569 !important;
288
  }
289
-
290
  /* Answer box - Success green accent, auto-height */
291
  .answer-box > div:nth-child(2) {
292
  border-left: 4px solid #10b981 !important;
@@ -317,7 +317,7 @@ def main():
317
  border-radius: 6px !important;
318
  overflow-x: auto !important;
319
  }
320
-
321
  /* Verification box - Blue accent */
322
  .verification-box > div:nth-child(2) {
323
  border-left: 4px solid #0ea5e9 !important;
@@ -333,7 +333,7 @@ def main():
333
  .verification-box strong {
334
  color: #075985 !important;
335
  }
336
-
337
  /* Stats panel - Professional blue gradient */
338
  .stats-panel {
339
  background: linear-gradient(135deg, #0369a1 0%, #0284c7 50%, #0ea5e9 100%) !important;
@@ -346,7 +346,7 @@ def main():
346
  .stats-panel strong {
347
  color: #ffffff !important;
348
  }
349
-
350
  /* Info panel */
351
  .info-panel {
352
  background: #eff6ff !important;
@@ -355,7 +355,7 @@ def main():
355
  padding: 12px !important;
356
  color: #1e40af !important;
357
  }
358
-
359
  /* Form elements */
360
  .gr-input, .gr-textbox textarea {
361
  background: #ffffff !important;
@@ -367,13 +367,13 @@ def main():
367
  border-color: #0ea5e9 !important;
368
  box-shadow: 0 0 0 3px rgba(14, 165, 233, 0.1) !important;
369
  }
370
-
371
  /* Labels */
372
  label {
373
  color: #374151 !important;
374
  font-weight: 500 !important;
375
  }
376
-
377
  /* Dropdown - High contrast with darker background for visibility */
378
  .gr-dropdown,
379
  [data-testid="dropdown"],
@@ -398,7 +398,7 @@ def main():
398
  background: transparent !important;
399
  font-weight: 500 !important;
400
  }
401
-
402
  /* Dropdown container and options */
403
  [data-testid="dropdown"] span,
404
  .dropdown-container span,
@@ -406,7 +406,7 @@ def main():
406
  color: #1e293b !important;
407
  font-weight: 500 !important;
408
  }
409
-
410
  /* Dropdown list options */
411
  .gr-dropdown ul,
412
  .dropdown-options,
@@ -427,14 +427,14 @@ def main():
427
  background: #c7d2fe !important;
428
  color: #1e40af !important;
429
  }
430
-
431
  /* Dropdown label */
432
  .gr-dropdown label,
433
  [data-testid="dropdown"] label {
434
  color: #1e40af !important;
435
  font-weight: 600 !important;
436
  }
437
-
438
  /* Tabs - Clean styling */
439
  .tab-nav {
440
  border-bottom: 2px solid #e2e8f0 !important;
@@ -451,7 +451,7 @@ def main():
451
  border-bottom: 3px solid #0369a1 !important;
452
  font-weight: 600 !important;
453
  }
454
-
455
  /* Markdown text */
456
  .prose, .markdown-text {
457
  color: #334155 !important;
@@ -463,7 +463,7 @@ def main():
463
  .prose strong, .markdown-text strong {
464
  color: #0f172a !important;
465
  }
466
-
467
  /* Scrollbar styling */
468
  ::-webkit-scrollbar {
469
  width: 8px;
@@ -495,7 +495,7 @@ def main():
495
  background: #1d4ed8 !important;
496
  box-shadow: 0 4px 10px rgba(30, 64, 175, 0.4) !important;
497
  }
498
-
499
  /* Left side input boxes with borders */
500
  .left-panel-box {
501
  background: #fafafa !important;
@@ -508,7 +508,7 @@ def main():
508
  border-color: #64748b !important;
509
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1) !important;
510
  }
511
-
512
  /* File upload box with border */
513
  .file-upload-box {
514
  background: #f8fafc !important;
@@ -521,7 +521,7 @@ def main():
521
  border-style: solid !important;
522
  background: #f0f9ff !important;
523
  }
524
-
525
  /* Question input box with border */
526
  .question-box {
527
  background: #fffbeb !important;
@@ -631,7 +631,7 @@ setInterval(tick, 500);
631
  # Launch server - Compatible with both local and Hugging Face Spaces
632
  # HF Spaces sets SPACE_ID environment variable
633
  is_hf_space = os.environ.get("SPACE_ID") is not None
634
-
635
  with gr.Blocks(title="SmartDoc AI") as demo:
636
  gr.Markdown("### SmartDoc AI - Document Q&A", elem_classes="app-title")
637
  gr.Markdown("Upload your documents and ask questions. Answers will appear below, just like a chat.", elem_classes="app-description")
@@ -683,7 +683,7 @@ setInterval(tick, 500);
683
  )
684
  try:
685
  if not question_text.strip():
686
-
687
  chat_history.append({"role": "user", "content": question_text})
688
  chat_history.append({"role": "assistant", "content": "Please enter a question."})
689
  yield (
@@ -698,7 +698,7 @@ setInterval(tick, 500);
698
  )
699
  return
700
  if not uploaded_files:
701
-
702
  chat_history.append({"role": "user", "content": question_text})
703
  chat_history.append({"role": "assistant", "content": "Please upload at least one document."})
704
  yield (
@@ -824,7 +824,7 @@ setInterval(tick, 500);
824
  verification = result.get("verification_report", "No verification details available.")
825
  logger.info(f"Verification (internal):\n{verification}")
826
  # Do not display verification to user, only use internally
827
-
828
  chat_history.append({"role": "user", "content": question_text})
829
  chat_history.append({"role": "assistant", "content": f"**Answer:**\n{answer}"})
830
  session_state.value["last_documents"] = retriever.invoke(question_text)
@@ -853,8 +853,8 @@ setInterval(tick, 500);
853
  )
854
  except Exception as e:
855
  logger.error(f"Processing error: {e}", exc_info=True)
856
-
857
-
858
  chat_history.append({"role": "user", "content": question_text})
859
  chat_history.append({"role": "assistant", "content": f"Error: {str(e)}"})
860
  yield (
@@ -897,37 +897,94 @@ setInterval(tick, 500);
897
  ex_data = EXAMPLES[example_key]
898
  question_text = ex_data["question"]
899
  file_names = ex_data["file_paths"]
900
-
901
  # Try to download from HF dataset if on Spaces
902
  if is_hf_space:
903
  try:
904
- from huggingface_hub import hf_hub_download
 
 
905
  copied_files = []
906
  file_info_text = f"✅ Loaded: {example_key}\n\n"
907
 
908
- # Get HF token for private dataset access (optional)
909
  hf_token = os.environ.get("HF_TOKEN", None)
910
 
911
- for file_path in file_names:
912
- filename = os.path.basename(file_path)
913
- try:
914
- local_path = hf_hub_download(
915
- repo_id="TilanB/smartdoc-samples", # Correct dataset repo
916
- repo_type="dataset",
917
- filename=filename,
918
- token=hf_token, # Pass token for private repos
919
- )
920
- copied_files.append(local_path)
921
- file_size_mb = os.path.getsize(local_path) / (1024 * 1024)
922
- file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
923
- except Exception as e:
924
- logger.error(f"Failed to download {filename}: {e}")
925
- file_info_text += f"❌ {filename} - Download failed\n"
926
- if not copied_files:
927
- return [], "", "❌ Could not load example files from dataset. Make sure the dataset is public or HF_TOKEN is set."
928
- return copied_files, question_text, file_info_text
929
- except ImportError:
930
- return [], "", "❌ huggingface_hub not installed"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
931
  else:
932
  # Local mode - use files from samples directory
933
  import tempfile
@@ -948,17 +1005,17 @@ setInterval(tick, 500);
948
  if not copied_files:
949
  return [], "", "Could not load example files"
950
  return copied_files, question_text, file_info_text
951
-
952
  example_dropdown.change(
953
  fn=load_example,
954
  inputs=[example_dropdown],
955
  outputs=[files, question, loaded_file_info]
956
  )
957
-
958
  # Show loaded_file_info when example is selected
959
  def show_info(example_key):
960
  return gr.update(visible=bool(example_key))
961
-
962
  example_dropdown.change(
963
  fn=show_info,
964
  inputs=[example_dropdown],
@@ -967,7 +1024,7 @@ setInterval(tick, 500);
967
  # Launch server - Compatible with both local and Hugging Face Spaces
968
  # HF Spaces sets SPACE_ID environment variable
969
  is_hf_space = os.environ.get("SPACE_ID") is not None
970
-
971
  demo.queue()
972
  if is_hf_space:
973
  # Hugging Face Spaces configuration
 
19
  from search_engine.indexer import RetrieverBuilder
20
  from intelligence.orchestrator import AgentWorkflow
21
  from configuration import definitions, parameters
22
+
23
 
24
  # Rate limiting configuration - 3 requests per hour per IP
25
  WINDOW_S = 3600
 
31
  """Thread-safe rate limiting per IP address."""
32
  ip = getattr(request.client, "host", "unknown")
33
  now = time.time()
34
+
35
  with _calls_lock:
36
  q = _calls[ip]
37
  # Remove expired entries
38
  while q and (now - q[0]) > WINDOW_S:
39
  q.popleft()
40
+
41
  if len(q) >= MAX_CALLS:
42
  import gradio as gr
43
  raise gr.Error(f"Rate limit: {MAX_CALLS} requests per {WINDOW_S//60} minutes. Please wait.")
44
+
45
  q.append(now)
46
 
47
 
 
66
  """Format chat history as markdown for display."""
67
  if not history:
68
  return "*No conversation history yet. Ask a question to get started!*"
69
+
70
  formatted = []
71
  for i, entry in enumerate(history, 1):
72
  timestamp = entry.get("timestamp", "")
73
  question = entry.get("question", "")
74
  answer = entry.get("answer", "")
75
  confidence = entry.get("confidence", "N/A")
76
+
77
  formatted.append(f"""
78
  ---
79
  ### 💬 Q{i} ({timestamp})
 
83
 
84
  *Confidence: {confidence}*
85
  """)
86
+
87
  return "\n".join(formatted)
88
 
89
 
 
91
  """Format retrieved documents with annotation highlighting."""
92
  if not documents:
93
  return "*No documents retrieved yet.*"
94
+
95
  formatted = [f"### 📚 Retrieved Context ({len(documents)} chunks)\n"]
96
+
97
  # Extract key terms from question for highlighting
98
  key_terms = []
99
  if question:
100
  stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or', 'what', 'how', 'why', 'when', 'where', 'which'}
101
  key_terms = [word.lower() for word in question.split() if word.lower() not in stopwords and len(word) > 2]
102
+
103
  for i, doc in enumerate(documents[:5], 1):
104
  content = doc.page_content if hasattr(doc, 'page_content') else str(doc)
105
  source = doc.metadata.get('source', 'Unknown') if hasattr(doc, 'metadata') else 'Unknown'
106
+
107
  # Truncate long content
108
  if len(content) > 500:
109
  content = content[:500] + "..."
 
114
  import re
115
  pattern = re.compile(re.escape(term), re.IGNORECASE)
116
  highlighted_content = pattern.sub(f"**{term}**", highlighted_content)
117
+
118
  formatted.append(f"""
119
  <details>
120
  <summary>📄 Chunk {i} - {os.path.basename(source)}</summary>
 
123
 
124
  </details>
125
  """)
126
+
127
  if len(documents) > 5:
128
  formatted.append(f"\n*... and {len(documents) - 5} more chunks*")
129
+
130
  return "\n".join(formatted)
131
 
132
 
 
186
  _ensure_hfhub_hffolder_compat() # must run before importing gradio
187
  import gradio as gr
188
  _setup_gradio_shim()
189
+
190
  logger.info("=" * 60)
191
  logger.info("Starting SmartDoc AI application...")
192
  logger.info("=" * 60)
193
+
194
  # Initialize components
195
  processor = DocumentProcessor()
196
  retriever_indexer = RetrieverBuilder()
197
  orchestrator = AgentWorkflow()
198
+
199
  logger.info("All components initialized successfully")
200
 
201
  # CSS styling - Clean, accessible light theme with professional colors
 
205
  background: linear-gradient(180deg, #f8fafc 0%, #e2e8f0 100%) !important;
206
  font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
207
  }
208
+
209
  /* Title styles - Dark text for readability */
210
  .app-title {
211
  font-size: 2.2em !important;
 
226
  font-size: 0.95em !important;
227
  line-height: 1.6 !important;
228
  }
229
+
230
  /* Section headers */
231
  .section-header {
232
  color: #1e293b !important;
 
235
  padding-bottom: 8px !important;
236
  margin-bottom: 16px !important;
237
  }
238
+
239
  /* Chat history panel - Clean white card with more height */
240
  .chat-history {
241
  min-height: 500px;
 
259
  .chat-history strong {
260
  color: #1e293b !important;
261
  }
262
+
263
  /* Document context panel */
264
  .doc-context {
265
  max-height: 380px;
 
286
  .doc-context p, .doc-context span {
287
  color: #475569 !important;
288
  }
289
+
290
  /* Answer box - Success green accent, auto-height */
291
  .answer-box > div:nth-child(2) {
292
  border-left: 4px solid #10b981 !important;
 
317
  border-radius: 6px !important;
318
  overflow-x: auto !important;
319
  }
320
+
321
  /* Verification box - Blue accent */
322
  .verification-box > div:nth-child(2) {
323
  border-left: 4px solid #0ea5e9 !important;
 
333
  .verification-box strong {
334
  color: #075985 !important;
335
  }
336
+
337
  /* Stats panel - Professional blue gradient */
338
  .stats-panel {
339
  background: linear-gradient(135deg, #0369a1 0%, #0284c7 50%, #0ea5e9 100%) !important;
 
346
  .stats-panel strong {
347
  color: #ffffff !important;
348
  }
349
+
350
  /* Info panel */
351
  .info-panel {
352
  background: #eff6ff !important;
 
355
  padding: 12px !important;
356
  color: #1e40af !important;
357
  }
358
+
359
  /* Form elements */
360
  .gr-input, .gr-textbox textarea {
361
  background: #ffffff !important;
 
367
  border-color: #0ea5e9 !important;
368
  box-shadow: 0 0 0 3px rgba(14, 165, 233, 0.1) !important;
369
  }
370
+
371
  /* Labels */
372
  label {
373
  color: #374151 !important;
374
  font-weight: 500 !important;
375
  }
376
+
377
  /* Dropdown - High contrast with darker background for visibility */
378
  .gr-dropdown,
379
  [data-testid="dropdown"],
 
398
  background: transparent !important;
399
  font-weight: 500 !important;
400
  }
401
+
402
  /* Dropdown container and options */
403
  [data-testid="dropdown"] span,
404
  .dropdown-container span,
 
406
  color: #1e293b !important;
407
  font-weight: 500 !important;
408
  }
409
+
410
  /* Dropdown list options */
411
  .gr-dropdown ul,
412
  .dropdown-options,
 
427
  background: #c7d2fe !important;
428
  color: #1e40af !important;
429
  }
430
+
431
  /* Dropdown label */
432
  .gr-dropdown label,
433
  [data-testid="dropdown"] label {
434
  color: #1e40af !important;
435
  font-weight: 600 !important;
436
  }
437
+
438
  /* Tabs - Clean styling */
439
  .tab-nav {
440
  border-bottom: 2px solid #e2e8f0 !important;
 
451
  border-bottom: 3px solid #0369a1 !important;
452
  font-weight: 600 !important;
453
  }
454
+
455
  /* Markdown text */
456
  .prose, .markdown-text {
457
  color: #334155 !important;
 
463
  .prose strong, .markdown-text strong {
464
  color: #0f172a !important;
465
  }
466
+
467
  /* Scrollbar styling */
468
  ::-webkit-scrollbar {
469
  width: 8px;
 
495
  background: #1d4ed8 !important;
496
  box-shadow: 0 4px 10px rgba(30, 64, 175, 0.4) !important;
497
  }
498
+
499
  /* Left side input boxes with borders */
500
  .left-panel-box {
501
  background: #fafafa !important;
 
508
  border-color: #64748b !important;
509
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1) !important;
510
  }
511
+
512
  /* File upload box with border */
513
  .file-upload-box {
514
  background: #f8fafc !important;
 
521
  border-style: solid !important;
522
  background: #f0f9ff !important;
523
  }
524
+
525
  /* Question input box with border */
526
  .question-box {
527
  background: #fffbeb !important;
 
631
  # Launch server - Compatible with both local and Hugging Face Spaces
632
  # HF Spaces sets SPACE_ID environment variable
633
  is_hf_space = os.environ.get("SPACE_ID") is not None
634
+
635
  with gr.Blocks(title="SmartDoc AI") as demo:
636
  gr.Markdown("### SmartDoc AI - Document Q&A", elem_classes="app-title")
637
  gr.Markdown("Upload your documents and ask questions. Answers will appear below, just like a chat.", elem_classes="app-description")
 
683
  )
684
  try:
685
  if not question_text.strip():
686
+
687
  chat_history.append({"role": "user", "content": question_text})
688
  chat_history.append({"role": "assistant", "content": "Please enter a question."})
689
  yield (
 
698
  )
699
  return
700
  if not uploaded_files:
701
+
702
  chat_history.append({"role": "user", "content": question_text})
703
  chat_history.append({"role": "assistant", "content": "Please upload at least one document."})
704
  yield (
 
824
  verification = result.get("verification_report", "No verification details available.")
825
  logger.info(f"Verification (internal):\n{verification}")
826
  # Do not display verification to user, only use internally
827
+
828
  chat_history.append({"role": "user", "content": question_text})
829
  chat_history.append({"role": "assistant", "content": f"**Answer:**\n{answer}"})
830
  session_state.value["last_documents"] = retriever.invoke(question_text)
 
853
  )
854
  except Exception as e:
855
  logger.error(f"Processing error: {e}", exc_info=True)
856
+
857
+
858
  chat_history.append({"role": "user", "content": question_text})
859
  chat_history.append({"role": "assistant", "content": f"Error: {str(e)}"})
860
  yield (
 
897
  ex_data = EXAMPLES[example_key]
898
  question_text = ex_data["question"]
899
  file_names = ex_data["file_paths"]
900
+
901
  # Try to download from HF dataset if on Spaces
902
  if is_hf_space:
903
  try:
904
+ from datasets import load_dataset
905
+ import tempfile
906
+
907
  copied_files = []
908
  file_info_text = f"✅ Loaded: {example_key}\n\n"
909
 
910
+ # Get HF token (optional for public datasets)
911
  hf_token = os.environ.get("HF_TOKEN", None)
912
 
913
+ try:
914
+ # Load dataset - uses row-based structure
915
+ logger.info(f"Loading dataset from HuggingFace: TilanB/smartdoc-samples")
916
+ ds = load_dataset(
917
+ "TilanB/smartdoc-samples",
918
+ split="train",
919
+ token=hf_token
920
+ )
921
+ logger.info(f"Dataset loaded with {len(ds)} rows")
922
+
923
+ # Create temp directory for files
924
+ temp_dir = tempfile.mkdtemp(prefix='hf_examples_')
925
+
926
+ # Extract requested files from dataset rows
927
+ for file_path in file_names:
928
+ filename = os.path.basename(file_path)
929
+ file_found = False
930
+
931
+ # Search through dataset rows
932
+ for row in ds:
933
+ # Check if this row contains our file
934
+ # Adjust field names based on your dataset structure
935
+ row_filename = row.get('filename') or row.get('name') or row.get('path', '')
936
+
937
+ if os.path.basename(row_filename) == filename:
938
+ temp_file_path = os.path.join(temp_dir, filename)
939
+
940
+ # Handle different dataset column formats
941
+ if 'content' in row and row['content']:
942
+ # Binary content stored directly
943
+ with open(temp_file_path, 'wb') as f:
944
+ f.write(row['content'])
945
+ elif 'file' in row and row['file']:
946
+ # File object with bytes
947
+ file_obj = row['file']
948
+ if isinstance(file_obj, dict) and 'bytes' in file_obj:
949
+ with open(temp_file_path, 'wb') as f:
950
+ f.write(file_obj['bytes'])
951
+ elif isinstance(file_obj, bytes):
952
+ with open(temp_file_path, 'wb') as f:
953
+ f.write(file_obj)
954
+ elif 'data' in row and row['data']:
955
+ # Raw data field
956
+ with open(temp_file_path, 'wb') as f:
957
+ f.write(row['data'])
958
+ else:
959
+ logger.warning(f"Unknown dataset format for {filename}, available fields: {list(row.keys())}")
960
+ continue
961
+
962
+ copied_files.append(temp_file_path)
963
+ file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
964
+ file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
965
+ file_found = True
966
+ logger.info(f"Successfully extracted {filename} from dataset")
967
+ break
968
+
969
+ if not file_found:
970
+ logger.warning(f"File {filename} not found in dataset rows")
971
+ file_info_text += f"⚠️ {filename} - Not found in dataset\n"
972
+
973
+ if not copied_files:
974
+ # Log dataset structure for debugging
975
+ if len(ds) > 0:
976
+ logger.error(f"Dataset structure: {list(ds[0].keys())}")
977
+ return [], "", f"❌ Could not find example files in dataset.\n\nDataset has {len(ds)} rows. Please check dataset structure or upload files manually."
978
+
979
+ return copied_files, question_text, file_info_text
980
+
981
+ except Exception as e:
982
+ logger.error(f"Failed to load dataset: {e}", exc_info=True)
983
+ return [], "", f"❌ Failed to load dataset: {str(e)}\n\nPlease upload files manually."
984
+
985
+ except ImportError as e:
986
+ logger.error(f"datasets package not installed: {e}")
987
+ return [], "", "❌ 'datasets' package not installed"
988
  else:
989
  # Local mode - use files from samples directory
990
  import tempfile
 
1005
  if not copied_files:
1006
  return [], "", "Could not load example files"
1007
  return copied_files, question_text, file_info_text
1008
+
1009
  example_dropdown.change(
1010
  fn=load_example,
1011
  inputs=[example_dropdown],
1012
  outputs=[files, question, loaded_file_info]
1013
  )
1014
+
1015
  # Show loaded_file_info when example is selected
1016
  def show_info(example_key):
1017
  return gr.update(visible=bool(example_key))
1018
+
1019
  example_dropdown.change(
1020
  fn=show_info,
1021
  inputs=[example_dropdown],
 
1024
  # Launch server - Compatible with both local and Hugging Face Spaces
1025
  # HF Spaces sets SPACE_ID environment variable
1026
  is_hf_space = os.environ.get("SPACE_ID") is not None
1027
+
1028
  demo.queue()
1029
  if is_hf_space:
1030
  # Hugging Face Spaces configuration