msmaje committed on
Commit 8442587 · verified · 1 Parent(s): 684c9d1

Update app.py

Files changed (1)
  1. app.py +157 -447
app.py CHANGED
@@ -10,6 +10,7 @@ import zipfile
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Try importing LangChain components
 try:
     from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
     from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -31,7 +32,7 @@ try:
 except ImportError:
     try:
         from langchain_huggingface import HuggingFaceEndpoint
-        HUGGINGFACE_HUB_AVAILABLE = False
+        HUGGINGFACE_HUB_AVAILABLE = False  # HuggingFaceEndpoint doesn't have the same interface as HuggingFaceHub
         logger.info("Using HuggingFaceEndpoint as fallback")
     except ImportError:
         logger.error("No suitable HuggingFace LLM implementation found")
@@ -69,7 +70,7 @@ def initialize_models():
     # Get HuggingFace token from environment
     hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
     if not hf_token:
-        return False, "❌ HuggingFace API token not found in environment variables"
+        return False, "❌ HuggingFace API token not found in environment variables. Please set HUGGINGFACEHUB_API_TOKEN."
 
     return True, "✅ Models initialized successfully"
@@ -82,7 +83,7 @@ def create_llm():
     hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
     if not hf_token:
-        logger.error("HuggingFace API token not found")
+        logger.error("HuggingFace API token not found for LLM creation.")
         return create_fallback_llm()
 
     try:
@@ -116,7 +117,7 @@ def create_llm():
                 logger.warning(f"Failed to initialize {model_id} with HuggingFaceHub: {model_error}")
                 continue
 
-        # Fallback to HuggingFaceEndpoint if HuggingFaceHub is not available
+        # Fallback to HuggingFaceEndpoint if HuggingFaceHub is not available or failed
         try:
             from langchain_huggingface import HuggingFaceEndpoint
 
@@ -147,10 +148,10 @@ def create_llm():
                 logger.warning(f"Failed to initialize {model_id} with HuggingFaceEndpoint: {model_error}")
                 continue
         except ImportError:
-            pass
+            pass  # HuggingFaceEndpoint not available
 
         # If all else fails, return fallback
-        raise Exception("All model initialization attempts failed")
+        raise Exception("All HuggingFace model initialization attempts failed")
 
     except Exception as e:
         logger.error(f"LLM creation error: {e}")
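This hunk sits at the end of a try-each-model loop: every candidate model ID is attempted in order, failures are logged and skipped, and the exception above only fires once all candidates are exhausted. A minimal sketch of that pattern; `init_model` and the `MODEL_IDS` list are hypothetical stand-ins for the real initialization calls in `create_llm`:

```python
import logging

logger = logging.getLogger(__name__)

MODEL_IDS = ["model-a", "model-b"]  # illustrative candidate list

def create_llm_with_fallback(init_model):
    """Try each candidate model in order; raise only if every one fails."""
    for model_id in MODEL_IDS:
        try:
            llm = init_model(model_id)
            logger.info(f"Initialized {model_id}")
            return llm
        except Exception as model_error:
            logger.warning(f"Failed to initialize {model_id}: {model_error}")
            continue
    raise Exception("All HuggingFace model initialization attempts failed")
```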
@@ -195,7 +196,7 @@ def create_fallback_llm():
         def invoke(self, prompt):
             return "System temporarily unavailable. Please try again later."
 
-        def __call__(self, prompt):
+        def __call__(self, prompt):  # For compatibility with older LangChain chains
             return self.invoke(prompt)
 
     return SimpleFallback()
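The stub works because it satisfies both call styles LangChain uses: the newer `.invoke(prompt)` and the legacy `llm(prompt)`. A self-contained sketch of the whole fallback, assuming only what the hunk shows:

```python
def create_fallback_llm():
    """Return a stub LLM so the app degrades gracefully when no real model loads."""

    class SimpleFallback:
        def invoke(self, prompt):
            # Same canned reply regardless of input.
            return "System temporarily unavailable. Please try again later."

        def __call__(self, prompt):  # legacy call style used by older chains
            return self.invoke(prompt)

    return SimpleFallback()

# Either call style works:
llm = create_fallback_llm()
print(llm.invoke("hello"))
print(llm("hello"))
```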
@@ -222,7 +223,7 @@ def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
     documents = loader.load()
 
     if not documents:
-        return "❌ No documents were loaded from the PDFs folder."
+        return "❌ No documents were loaded from the PDFs folder. Ensure the folder contains valid PDFs."
 
     # Split documents into chunks
     text_splitter = RecursiveCharacterTextSplitter(
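For reference, the load-then-split step this hunk's error message guards: a sketch of loading a PDF folder and chunking it, using the same langchain_community loaders imported at the top of the file (the `./pdfs` path matches the folder referenced elsewhere in the diff):

```python
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

PDF_FOLDER_PATH = "./pdfs"

loader = PyPDFDirectoryLoader(PDF_FOLDER_PATH)
documents = loader.load()  # one Document per PDF page

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # default used by load_preloaded_pdfs
    chunk_overlap=200,  # keeps context across chunk boundaries
)
chunks = text_splitter.split_documents(documents)
print(f"{len(documents)} pages -> {len(chunks)} chunks")
```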
@@ -269,12 +270,12 @@ Helpful Answer:
             test_result = retrieval_qa({"query": "test"})
             logger.info("QA chain test successful")
         except Exception as test_error:
-            logger.warning(f"QA chain test failed: {test_error}")
+            logger.warning(f"QA chain test failed during initial run: {test_error}")
             # Chain created but might have issues - continue anyway
 
     except Exception as chain_error:
         logger.error(f"Chain creation error: {chain_error}")
-        return f"❌ Error creating QA chain: {str(chain_error)}"
+        return f"❌ Error creating QA chain: {str(chain_error)}. Check LLM availability."
 
     pdf_files = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
     return f"✅ Successfully processed {len(documents)} documents from {len(pdf_files)} PDF files into {len(chunks)} chunks. Ready for questions!"
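The "QA chain test" here is a smoke test: fire one throwaway query right after building the chain so failures surface at processing time rather than on the user's first real question. A sketch of chain construction plus that smoke test, assuming a RetrievalQA setup like this app's (`llm`, `vectorstore`, and `logger` come from earlier steps):

```python
from langchain.chains import RetrievalQA

retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
)

# Smoke-test the chain with a throwaway query; log but don't abort on failure.
try:
    retrieval_qa({"query": "test"})
except Exception as test_error:
    logger.warning(f"QA chain test failed during initial run: {test_error}")
```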
@@ -302,19 +303,18 @@ def extract_zip_to_pdfs(zip_file):
 
         for pdf_file in pdf_files:
             # Extract to PDFs folder
-            zip_ref.extract(pdf_file, PDF_FOLDER_PATH)
-
-            # If file is in a subfolder, move it to the root of PDFs folder
-            extracted_path = os.path.join(PDF_FOLDER_PATH, pdf_file)
-            if os.path.dirname(pdf_file):  # File is in a subfolder
-                new_path = os.path.join(PDF_FOLDER_PATH, os.path.basename(pdf_file))
-                shutil.move(extracted_path, new_path)
-                # Clean up empty directories
-                try:
-                    os.rmdir(os.path.dirname(extracted_path))
-                except:
-                    pass
-
+            # Ensure the path is safe and doesn't lead to directory traversal
+            extracted_path = os.path.join(PDF_FOLDER_PATH, os.path.basename(pdf_file))
+
+            # Check if the extracted path is within the intended PDF_FOLDER_PATH
+            if not os.path.abspath(extracted_path).startswith(os.path.abspath(PDF_FOLDER_PATH)):
+                logger.warning(f"Attempted path traversal detected: {pdf_file}")
+                continue  # Skip this file
+
+            # Extract the file
+            with open(extracted_path, "wb") as f:
+                f.write(zip_ref.read(pdf_file))
 
     global PRELOADED_PDFS
     PRELOADED_PDFS = True
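This is the substantive fix in the commit: instead of `zip_ref.extract()` plus a move-and-cleanup dance, each entry is flattened to its basename and written inside `PDF_FOLDER_PATH`, refusing anything that would escape the folder (the classic "Zip Slip" traversal). A self-contained sketch of the same guard using only the standard library:

```python
import os
import zipfile

PDF_FOLDER_PATH = "./pdfs"

def safe_extract_pdfs(zip_path):
    """Extract only .pdf entries, flattened into PDF_FOLDER_PATH."""
    os.makedirs(PDF_FOLDER_PATH, exist_ok=True)
    extracted = []
    with zipfile.ZipFile(zip_path) as zip_ref:
        for name in zip_ref.namelist():
            if not name.lower().endswith(".pdf"):
                continue
            # Flatten subfolders and reject anything escaping the target folder.
            target = os.path.join(PDF_FOLDER_PATH, os.path.basename(name))
            if not os.path.abspath(target).startswith(os.path.abspath(PDF_FOLDER_PATH)):
                continue  # path traversal attempt
            with open(target, "wb") as f:
                f.write(zip_ref.read(name))
            extracted.append(target)
    return extracted
```

Because the basename is taken first, the `startswith` check is belt-and-suspenders here, which is the same layering the committed code uses.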
 
@@ -401,11 +401,11 @@ Helpful Answer:
             test_result = retrieval_qa({"query": "test"})
             logger.info("QA chain test successful")
         except Exception as test_error:
-            logger.warning(f"QA chain test failed: {test_error}")
+            logger.warning(f"QA chain test failed during initial run: {test_error}")
 
     except Exception as chain_error:
         logger.error(f"Chain creation error: {chain_error}")
-        return f"❌ Error creating QA chain: {str(chain_error)}"
+        return f"❌ Error creating QA chain: {str(chain_error)}. Check LLM availability."
 
     # Clean up temp directory
     shutil.rmtree(temp_dir)
@@ -427,7 +427,7 @@ def answer_question(question):
         return "❌ Please upload and process PDF files first.", ""
 
     try:
-        # Get answer from RAG system with timeout and error handling
+        # Get answer from RAG system
        result = retrieval_qa({"query": question})
 
        answer = result.get("result", "No answer generated")
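`retrieval_qa({"query": ...})` returns a dict; with `return_source_documents=True` it carries both the generated answer and the retrieved chunks. A sketch of unpacking it defensively, the way `answer_question` does (`retrieval_qa` and `question` come from the surrounding code):

```python
result = retrieval_qa({"query": question})

answer = result.get("result", "No answer generated")
sources = result.get("source_documents", [])

# Summarize where each supporting chunk came from.
sources_text = "\n".join(
    f"- {doc.metadata.get('source', 'unknown')} (page {doc.metadata.get('page', '?')})"
    for doc in sources
)
```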
@@ -448,7 +448,7 @@ def answer_question(question):
     except Exception as e:
         logger.error(f"Question answering error: {e}")
 
-        # Provide a fallback response using just the retriever
+        # Provide a fallback response using just the retriever if LLM fails
         try:
             if vectorstore is not None:
                 # Get relevant documents directly from vectorstore
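When the LLM call fails, the app degrades to a retrieval-only answer: query the vector store directly and return the raw passages with a disclaimer, as the next hunk shows. A sketch of that fallback, assuming `vectorstore` supports `similarity_search` (FAISS and Chroma both do):

```python
def retrieval_only_answer(vectorstore, question, k=3):
    """Return raw retrieved passages when the LLM is unavailable."""
    docs = vectorstore.similarity_search(question, k=k)
    if not docs:
        return "No relevant passages found.", ""
    fallback_answer = "\n\n".join(doc.page_content for doc in docs)
    sources_text = "\n".join(
        f"- {doc.metadata.get('source', 'unknown')}" for doc in docs
    )
    note = "\n*Note: This is a direct search result due to a technical issue with the AI model.*"
    return fallback_answer + note, sources_text
```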
@@ -469,11 +469,11 @@
 
                 return fallback_answer + "\n*Note: This is a direct search result due to a technical issue with the AI model.*", sources_text
             else:
-                return f"❌ Error answering question: {str(e)}", ""
+                return f"❌ Error answering question: {str(e)}. Vector store not initialized.", ""
 
         except Exception as fallback_error:
-            logger.error(f"Fallback error: {fallback_error}")
-            return f"❌ Error answering question: {str(e)}", ""
+            logger.error(f"Fallback error during question answering: {fallback_error}")
+            return f"❌ Critical error answering question: {str(e)}", ""
 
 def create_interface():
     """Create the fully responsive Gradio interface"""
@@ -530,6 +530,28 @@ def create_interface():
         min-width: 0 !important;
     }
 
+    /* Remove any pre-existing or default Gradio styling that might conflict */
+    .gradio-container,
+    .gr-panel,
+    .gr-block,
+    .gr-group {
+        box-sizing: border-box !important;
+        min-width: 0 !important; /* Ensure elements can shrink */
+    }
+
+    /* Ensure images and media scale within their containers */
+    img, video {
+        max-width: 100% !important;
+        height: auto !important;
+        display: block !important;
+    }
+
+    /* Specific adjustments for file upload area text */
+    .gr-file .file-upload-text {
+        font-size: clamp(0.75rem, 3vw, 1rem) !important; /* Make text smaller on mobile */
+        line-height: 1.4 !important;
+    }
+
     /* Mobile-first responsive breakpoints */
 
     /* Small devices (phones, 320px and up) */
@@ -593,6 +615,24 @@ def create_interface():
     .gr-accordion {
         border-radius: var(--radius-md) !important;
         border: 1px solid var(--border-color) !important;
+        width: 100% !important; /* Force full width */
+        flex: none !important; /* Prevent flex issues */
+    }
+    /* Adjust spacing for accordions within columns */
+    .gr-column .gr-accordion {
+        margin-bottom: 1rem !important;
+    }
+
+    /* Ensure direct children of gradio-container also respond well */
+    .gradio-container > *:not(.gr-footer) { /* Exclude footer if it exists */
+        width: 100% !important;
+        margin-left: auto !important;
+        margin-right: auto !important;
+    }
+
+    /* Make sure all gradio components inside rows take full width */
+    .gr-row > .gr-block {
+        width: 100% !important;
     }
 
     /* Slider improvements */
@@ -634,11 +674,16 @@ def create_interface():
 
     /* Two-column layout for medium screens */
     .gr-column:first-child {
-        flex: 0 0 35% !important;
+        flex: 0 0 40% !important;
+        max-width: 40% !important;
     }
 
     .gr-column:last-child {
-        flex: 0 0 60% !important;
+        flex: 1 1 55% !important;
+        max-width: 55% !important;
+    }
+    .gr-row {
+        justify-content: space-between !important; /* Distribute space */
     }
     }
@@ -667,7 +712,8 @@ def create_interface():
 
     /* Optimal desktop layout */
     .gr-column:first-child {
-        flex: 0 0 400px !important;
+        flex: 0 0 350px !important;
+        max-width: 350px !important;
     }
 
     .gr-column:last-child {
@@ -785,7 +831,7 @@ def create_interface():
 
     .gr-textbox textarea:focus,
     .gr-textbox input:focus {
-        border-color: var(--primary-color) !important;
+        border-color: var(--primary-color) !important;
         outline: none !important;
         box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1) !important;
     }
@@ -882,443 +928,107 @@ def create_interface():
         background: var(--text-secondary) !important;
     }
 
-    /* Animation classes */
-    .fade-in {
-        animation: fadeIn 0.3s ease-in-out !important;
-    }
-
-    @keyframes fadeIn {
-        from { opacity: 0; transform: translateY(10px); }
-        to { opacity: 1; transform: translateY(0); }
-    }
-
-    /* Accessibility improvements */
-    .gr-button:focus-visible,
-    .gr-textbox input:focus-visible,
-    .gr-textbox textarea:focus-visible {
-        outline: 2px solid var(--primary-color) !important;
-        outline-offset: 2px !important;
-    }
-
-    /* Print styles */
-    @media print {
-        .gr-button,
-        .gr-file,
-        .gr-slider {
-            display: none !important;
-        }
-
-        .gr-textbox textarea,
-        .gr-textbox input {
-            border: 1px solid #000 !important;
-            background: white !important;
-        }
-    }
-
-    /* High contrast mode support */
-    @media (prefers-contrast: high) {
-        :root {
-            --border-color: #000000;
-            --text-primary: #000000;
-            --text-secondary: #333333;
-            --bg-primary: #ffffff;
-            --bg-secondary: #f0f0f0;
-        }
-    }
-
-    /* Reduced motion support */
-    @media (prefers-reduced-motion: reduce) {
-        * {
-            animation-duration: 0.01ms !important;
-            animation-iteration-count: 1 !important;
-            transition-duration: 0.01ms !important;
-        }
-    }
-
-    /* Error and success states */
-    .gr-textbox.error textarea,
-    .gr-textbox.error input {
-        border-color: #ef4444 !important;
-        background: rgba(239, 68, 68, 0.05) !important;
-    }
-
-    .gr-textbox.success textarea,
-    .gr-textbox.success input {
-        border-color: var(--secondary-color) !important;
-        background: rgba(16, 185, 129, 0.05) !important;
-    }
-
-    /* Custom status messages */
-    .status-message {
-        padding: 0.75rem 1rem !important;
-        border-radius: var(--radius-md) !important;
-        margin: 0.5rem 0 !important;
-        font-size: 0.875rem !important;
-        font-weight: 500 !important;
+    /* Ensure good spacing for text outputs */
+    .gr-markdown {
+        padding: 1rem 0 !important;
     }
 
-    .status-success {
-        background: rgba(16, 185, 129, 0.1) !important;
-        color: #059669 !important;
-        border: 1px solid rgba(16, 185, 129, 0.2) !important;
-    }
-
-    .status-error {
-        background: rgba(239, 68, 68, 0.1) !important;
-        color: #dc2626 !important;
-        border: 1px solid rgba(239, 68, 68, 0.2) !important;
-    }
-
-    .status-warning {
-        background: rgba(245, 158, 11, 0.1) !important;
-        color: #d97706 !important;
-        border: 1px solid rgba(245, 158, 11, 0.2) !important;
-    }
-
-    /* Enhanced focus styles for accessibility */
-    .gr-button:focus,
-    .gr-textbox input:focus,
-    .gr-textbox textarea:focus,
-    .gr-file:focus {
-        outline: 2px solid var(--primary-color) !important;
-        outline-offset: 2px !important;
-    }
-
-    /* Custom scrollable areas */
-    .scrollable-content {
-        max-height: 400px !important;
-        overflow-y: auto !important;
-        padding: 1rem !important;
-        background: var(--bg-secondary) !important;
-        border-radius: var(--radius-md) !important;
-        border: 1px solid var(--border-color) !important;
-    }
     """
 
-    # Create the interface
-    with gr.Blocks(css=custom_css, title="📚 RAG PDF Chat Interface", theme=gr.themes.Soft()) as interface:
-
-        # Header
-        gr.Markdown("""
-        # 📚 RAG PDF Chat Interface
-
-        **Upload PDF documents and ask questions about their content using advanced AI**
-
-        This interface allows you to:
-        - Upload PDF files or ZIP archives containing PDFs
-        - Process documents using state-of-the-art text chunking and embedding techniques
-        - Ask questions about your documents using natural language
-        - Get accurate answers with source citations
-        """)
-
-        # Main interface layout
-        with gr.Row():
-            # Left column - Controls
-            with gr.Column(scale=1):
-
-                # Pre-loaded PDFs section
-                with gr.Accordion("📁 Pre-loaded PDFs", open=PRELOADED_PDFS):
-                    gr.Markdown("""
-                    **Option 1: Use pre-existing PDFs**
-
-                    If you have PDFs in the `./pdfs` folder, click the button below to process them.
-                    """)
-
-                    preload_btn = gr.Button(
-                        "🔄 Load Pre-existing PDFs",
-                        variant="secondary",
-                        size="sm"
-                    )
-                    preload_status = gr.Textbox(
-                        label="Pre-load Status",
-                        interactive=False,
-                        lines=2
-                    )
-
-                # ZIP upload section
-                with gr.Accordion("📦 Upload ZIP Archive", open=False):
-                    gr.Markdown("""
-                    **Option 2: Upload ZIP containing PDFs**
-
-                    Upload a ZIP file containing PDF documents. They will be extracted to the PDFs folder.
-                    """)
-
-                    zip_file = gr.File(
-                        label="Upload ZIP Archive",
-                        file_types=[".zip"],
-                        file_count="single"
-                    )
-                    zip_btn = gr.Button(
-                        "📦 Extract ZIP to PDFs",
-                        variant="secondary",
-                        size="sm"
-                    )
-                    zip_status = gr.Textbox(
-                        label="ZIP Status",
-                        interactive=False,
-                        lines=2
-                    )
-
-                # Direct PDF upload section
-                with gr.Accordion("📄 Upload PDF Files", open=True):
-                    gr.Markdown("""
-                    **Option 3: Direct PDF upload**
-
-                    Upload PDF files directly for processing.
-                    """)
-
-                    pdf_files = gr.File(
-                        label="Upload PDF Files",
-                        file_types=[".pdf"],
-                        file_count="multiple"
-                    )
-
-                # Processing parameters
-                with gr.Accordion("⚙️ Processing Parameters", open=False):
-                    gr.Markdown("""
-                    **Advanced Settings**
-
-                    Adjust these parameters to optimize document processing for your specific needs.
-                    """)
-
-                    chunk_size = gr.Slider(
-                        minimum=500,
-                        maximum=2000,
-                        value=1000,
-                        step=100,
-                        label="Chunk Size",
-                        info="Size of text chunks for processing (larger = more context, smaller = more precise)"
-                    )
-
-                    chunk_overlap = gr.Slider(
-                        minimum=0,
-                        maximum=500,
-                        value=200,
-                        step=50,
-                        label="Chunk Overlap",
-                        info="Overlap between chunks (helps maintain context across boundaries)"
-                    )
-
-                    # Process button
-                    process_btn = gr.Button(
-                        "🚀 Process Documents",
-                        variant="primary",
-                        size="lg"
-                    )
-
-                    # Status display
-                    status_output = gr.Textbox(
-                        label="Processing Status",
-                        interactive=False,
-                        lines=4
-                    )
-
-            # Right column - Chat interface
-            with gr.Column(scale=2):
-
-                # Chat interface
-                with gr.Tab("💬 Chat with Documents"):
-                    gr.Markdown("""
-                    **Ask questions about your documents**
-
-                    Once you've processed your PDFs, you can ask questions about their content.
-                    The AI will provide answers based on the information in your documents.
-                    """)
-
-                    # Question input
-                    question_input = gr.Textbox(
-                        label="Ask a question about your documents",
-                        placeholder="e.g., What is the main topic discussed in the document?",
-                        lines=2
-                    )
-
-                    # Ask button
-                    ask_btn = gr.Button(
-                        "🔍 Ask Question",
-                        variant="primary",
-                        size="lg"
-                    )
-
-                    # Answer display
-                    with gr.Row():
-                        with gr.Column():
-                            answer_output = gr.Textbox(
-                                label="Answer",
-                                interactive=False,
-                                lines=8
-                            )
-
-                        with gr.Column():
-                            sources_output = gr.Textbox(
-                                label="Sources & References",
-                                interactive=False,
-                                lines=8
-                            )
-
-                # Help tab
-                with gr.Tab("❓ Help & Tips"):
-                    gr.Markdown("""
-                    ## 🔧 How to Use This Interface
-
-                    ### Step 1: Upload Documents
-                    Choose one of three options:
-                    - **Pre-loaded PDFs**: Use documents already in the `./pdfs` folder
-                    - **ZIP Archive**: Upload a ZIP file containing multiple PDFs
-                    - **Direct Upload**: Upload PDF files directly
-
-                    ### Step 2: Process Documents
-                    Click "Process Documents" to:
-                    - Extract text from PDFs
-                    - Split text into manageable chunks
-                    - Create embeddings for semantic search
-                    - Set up the question-answering system
-
-                    ### Step 3: Ask Questions
-                    Once processing is complete, you can:
-                    - Ask specific questions about document content
-                    - Get answers with source citations
-                    - Explore different aspects of your documents
-
-                    ## 💡 Tips for Better Results
-
-                    ### Question Formatting
-                    - **Good**: "What are the main findings about climate change?"
-                    - **Better**: "What specific evidence does the document provide about climate change impacts?"
-                    - **Best**: "According to the research, what are the three most significant climate change impacts on agriculture?"
-
-                    ### Document Preparation
-                    - Use high-quality, text-based PDFs (not scanned images)
-                    - Ensure documents are well-structured with clear headings
-                    - Remove unnecessary pages to improve processing speed
-
-                    ### Processing Parameters
-                    - **Chunk Size**:
-                      - Larger (1500-2000): Better for broad context questions
-                      - Smaller (500-1000): Better for specific detail questions
-                    - **Chunk Overlap**:
-                      - More overlap (200-300): Better context continuity
-                      - Less overlap (0-100): Faster processing
-
-                    ## 🚨 Troubleshooting
-
-                    ### Common Issues
-                    - **"No documents loaded"**: Check PDF file format and quality
-                    - **"Model initialization failed"**: Verify HuggingFace token is set
-                    - **"Processing timeout"**: Try smaller chunk sizes or fewer documents
-                    - **"Empty answers"**: Rephrase questions or check document content
-
-                    ### System Requirements
-                    - **HuggingFace Token**: Required for AI model access
-                    - **Memory**: At least 4GB RAM recommended for large documents
-                    - **Storage**: Sufficient space for temporary file processing
-
-                    ## 🔒 Privacy & Security
-
-                    - Documents are processed locally when possible
-                    - No document content is permanently stored
-                    - AI model queries may be sent to HuggingFace servers
-                    - Remove sensitive information before processing
-
-                    ## 📚 Supported Features
-
-                    - **File Types**: PDF documents only
-                    - **Languages**: Primarily English, limited support for other languages
-                    - **Document Size**: Up to 50MB per PDF recommended
-                    - **Concurrent Processing**: Multiple documents simultaneously
-
-                    ---
-
-                    *Need more help? Check the console output for detailed error messages and logs.*
-                    """)
-
-        # Event handlers
-        def handle_preload():
-            return load_preloaded_pdfs()
-
-        def handle_zip_extract(zip_file):
-            return extract_zip_to_pdfs(zip_file)
-
-        def handle_process(pdf_files, chunk_size, chunk_overlap):
-            return process_pdfs(pdf_files, chunk_size, chunk_overlap)
-
-        def handle_question(question):
-            return answer_question(question)
-
-        # Connect event handlers
-        preload_btn.click(
-            fn=handle_preload,
-            outputs=preload_status
-        )
-
-        zip_btn.click(
-            fn=handle_zip_extract,
-            inputs=zip_file,
-            outputs=zip_status
-        )
-
-        process_btn.click(
-            fn=handle_process,
-            inputs=[pdf_files, chunk_size, chunk_overlap],
-            outputs=status_output
-        )
-
-        ask_btn.click(
-            fn=handle_question,
-            inputs=question_input,
-            outputs=[answer_output, sources_output]
-        )
-
-        # Enable Enter key for question input
-        question_input.submit(
-            fn=handle_question,
-            inputs=question_input,
-            outputs=[answer_output, sources_output]
-        )
-
-        # Add keyboard shortcuts info
-        gr.Markdown("""
-        ---
-        **💡 Keyboard Shortcuts**: Press Enter in the question box to ask your question quickly!
-        """)
-
-        return interface
+    with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
+        gr.Markdown(
+            """
+            # RAG PDF Chat Interface
+            Upload PDF documents and ask questions about their content using advanced AI.
+
+            This interface allows you to:
+            - Upload PDF files or ZIP archives containing PDFs
+            - Process documents using state-of-the-art text chunking and embedding techniques
+            - Ask questions about your documents using natural language
+            - Get accurate answers with source citations
+            """
+        )
+
+        # Main content area
+        with gr.Row():
+            with gr.Column(scale=1):  # This column will contain processing options
+                with gr.Accordion("📁 Pre-loaded PDFs", open=True):
+                    gr.Markdown("### Option 1: Use pre-existing PDFs")
+                    gr.Markdown("If you have PDFs in the `./pdfs` folder, click the button below to process them.")
+                    load_preloaded_btn = gr.Button("🔄 Load Pre-existing PDFs", variant="secondary")
+                    pre_load_status = gr.Textbox(label="Pre-load Status", interactive=False, value="No pre-loaded PDFs processed yet.")
+
+                with gr.Accordion("📦 Upload ZIP Archive", open=False):
+                    gr.Markdown("### Option 2: Upload ZIP Archive")
+                    zip_file_input = gr.File(label="Upload ZIP File", type="file", file_count="single", file_types=[".zip"])
+                    extract_zip_btn = gr.Button("📤 Extract ZIP Archive", variant="primary")
+                    zip_status_output = gr.Textbox(label="ZIP Extraction Status", interactive=False)
+
+                with gr.Accordion("📄 Upload PDF Files", open=False):
+                    gr.Markdown("### Option 3: Direct PDF upload")
+                    gr.Markdown("Upload PDF files directly for processing.")
+                    pdf_file_input = gr.File(label="Upload PDF Files", type="file", file_count="multiple", file_types=[".pdf"])
+
+                with gr.Accordion("⚙️ Processing Parameters", open=False):
+                    chunk_size_slider = gr.Slider(
+                        minimum=100,
+                        maximum=2000,
+                        value=1000,
+                        step=50,
+                        label="Chunk Size",
+                        info="Size of text chunks for processing."
+                    )
+                    chunk_overlap_slider = gr.Slider(
+                        minimum=0,
+                        maximum=500,
+                        value=200,
+                        step=10,
+                        label="Chunk Overlap",
+                        info="Overlap between text chunks to maintain context."
+                    )
+                    process_btn = gr.Button("🚀 Process Documents", variant="primary")
+                    processing_status = gr.Textbox(label="Processing Status", interactive=False)
+
+            with gr.Column(scale=2):  # This column will contain the chat interface
+                with gr.Accordion("💬 Chat with Documents", open=True):
+                    gr.Markdown("### Ask questions about your documents")
+                    gr.Markdown("Once you've processed your PDFs, you can ask questions about their content. The AI will provide answers based on the information in your documents.")
+                    question_input = gr.Textbox(label="Ask a question about your documents", placeholder="e.g., What is the main topic of the documents?")
+                    answer_output = gr.Textbox(label="Answer", interactive=False)
+                    sources_output = gr.Textbox(label="Sources & References", interactive=False)
+                    ask_btn = gr.Button("🔍 Ask Question", variant="primary")
+
+        gr.Markdown("Help & Tips: Ensure you have your HuggingFace API token set as an environment variable (HUGGINGFACEHUB_API_TOKEN) for the LLM to function properly.")
+
+        # Event listeners
+        load_preloaded_btn.click(
+            load_preloaded_pdfs,
+            inputs=[chunk_size_slider, chunk_overlap_slider],  # Pass sliders to function
+            outputs=pre_load_status
+        )
+        extract_zip_btn.click(
+            extract_zip_to_pdfs,
+            inputs=zip_file_input,
+            outputs=zip_status_output
+        )
+        process_btn.click(
+            process_pdfs,
+            inputs=[pdf_file_input, chunk_size_slider, chunk_overlap_slider],
+            outputs=processing_status
+        )
+        ask_btn.click(
+            answer_question,
+            inputs=question_input,
+            outputs=[answer_output, sources_output]
+        )
+
+        # Initial model check
+        demo.load(initialize_models, outputs=pre_load_status)  # Use pre_load_status to show init message
+
+    return demo
 
-# Main execution
 if __name__ == "__main__":
-    # Initialize the interface
-    interface = create_interface()
-
-    # Check system status
-    print("🔍 System Status Check:")
-    print(f"✅ LangChain Available: {LANGCHAIN_AVAILABLE}")
-    print(f"✅ HuggingFace Hub Available: {HUGGINGFACE_HUB_AVAILABLE}")
-    print(f"✅ Pre-loaded PDFs: {PRELOADED_PDFS}")
-    print(f"✅ PDF Folder: {PDF_FOLDER_PATH}")
-
-    # Check for HuggingFace token
-    hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
-    if hf_token:
-        print("✅ HuggingFace API Token: Found")
-    else:
-        print("❌ HuggingFace API Token: Not found - Please set HUGGINGFACEHUB_API_TOKEN environment variable")
-
-    # Launch the interface
-    try:
-        interface.launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-            share=False,
-            debug=False,
-            show_error=True,
-            auth=None,
-            favicon_path=None,
-            ssl_keyfile=None,
-            ssl_certfile=None,
-            ssl_keyfile_password=None,
-            height=800,
-            prevent_thread_lock=False
-        )
-    except Exception as e:
-        logger.error(f"Failed to launch interface: {e}")
-        print(f"❌ Failed to launch interface: {e}")
-        print("🔧 Try running with: python your_script.py")
+    demo = create_interface()
+    # It's better to explicitly set share=False for local development
+    # and only set it to True if you intend to share publicly (which creates a public link)
+    demo.launch(show_api=False, inline=False)
 
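The rewritten `create_interface` drops the old `handle_*` wrapper functions and wires the worker functions to the buttons directly; it also uses `demo.load()` to run `initialize_models` once when the page loads. A minimal sketch of that wiring pattern, with placeholder callbacks standing in for the app's real functions:

```python
import gradio as gr

def initialize_models():
    return "✅ Models initialized"  # placeholder for the app's real startup check

def answer_question(question):
    return f"Echo: {question}", "(sources would go here)"  # placeholder

with gr.Blocks() as demo:
    question_input = gr.Textbox(label="Question")
    answer_output = gr.Textbox(label="Answer", interactive=False)
    sources_output = gr.Textbox(label="Sources", interactive=False)
    status = gr.Textbox(label="Status", interactive=False)
    ask_btn = gr.Button("Ask")

    # Wire the callback directly instead of going through a wrapper function.
    ask_btn.click(answer_question, inputs=question_input,
                  outputs=[answer_output, sources_output])

    # Run a startup check once when the page loads.
    demo.load(initialize_models, outputs=status)

if __name__ == "__main__":
    demo.launch()
```

Binding worker functions directly keeps the event graph flat; the wrappers in the removed code added a layer of indirection without changing behavior.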