Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,7 @@ import zipfile
|
|
| 10 |
logging.basicConfig(level=logging.INFO)
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
|
|
|
| 13 |
try:
|
| 14 |
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
|
| 15 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
@@ -31,7 +32,7 @@ try:
|
|
| 31 |
except ImportError:
|
| 32 |
try:
|
| 33 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 34 |
-
HUGGINGFACE_HUB_AVAILABLE = False
|
| 35 |
logger.info("Using HuggingFaceEndpoint as fallback")
|
| 36 |
except ImportError:
|
| 37 |
logger.error("No suitable HuggingFace LLM implementation found")
|
|
@@ -69,7 +70,7 @@ def initialize_models():
|
|
| 69 |
# Get HuggingFace token from environment
|
| 70 |
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 71 |
if not hf_token:
|
| 72 |
-
return False, "❌ HuggingFace API token not found in environment variables"
|
| 73 |
|
| 74 |
return True, "✅ Models initialized successfully"
|
| 75 |
|
|
@@ -82,7 +83,7 @@ def create_llm():
|
|
| 82 |
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 83 |
|
| 84 |
if not hf_token:
|
| 85 |
-
logger.error("HuggingFace API token not found")
|
| 86 |
return create_fallback_llm()
|
| 87 |
|
| 88 |
try:
|
|
@@ -116,7 +117,7 @@ def create_llm():
|
|
| 116 |
logger.warning(f"Failed to initialize {model_id} with HuggingFaceHub: {model_error}")
|
| 117 |
continue
|
| 118 |
|
| 119 |
-
# Fallback to HuggingFaceEndpoint if HuggingFaceHub is not available
|
| 120 |
try:
|
| 121 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 122 |
|
|
@@ -147,10 +148,10 @@ def create_llm():
|
|
| 147 |
logger.warning(f"Failed to initialize {model_id} with HuggingFaceEndpoint: {model_error}")
|
| 148 |
continue
|
| 149 |
except ImportError:
|
| 150 |
-
pass
|
| 151 |
|
| 152 |
# If all else fails, return fallback
|
| 153 |
-
raise Exception("All model initialization attempts failed")
|
| 154 |
|
| 155 |
except Exception as e:
|
| 156 |
logger.error(f"LLM creation error: {e}")
|
|
@@ -195,7 +196,7 @@ def create_fallback_llm():
|
|
| 195 |
def invoke(self, prompt):
|
| 196 |
return "System temporarily unavailable. Please try again later."
|
| 197 |
|
| 198 |
-
def __call__(self, prompt):
|
| 199 |
return self.invoke(prompt)
|
| 200 |
|
| 201 |
return SimpleFallback()
|
|
@@ -222,7 +223,7 @@ def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
|
|
| 222 |
documents = loader.load()
|
| 223 |
|
| 224 |
if not documents:
|
| 225 |
-
return "❌ No documents were loaded from the PDFs folder."
|
| 226 |
|
| 227 |
# Split documents into chunks
|
| 228 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
@@ -269,12 +270,12 @@ Helpful Answer:
|
|
| 269 |
test_result = retrieval_qa({"query": "test"})
|
| 270 |
logger.info("QA chain test successful")
|
| 271 |
except Exception as test_error:
|
| 272 |
-
logger.warning(f"QA chain test failed: {test_error}")
|
| 273 |
# Chain created but might have issues - continue anyway
|
| 274 |
|
| 275 |
except Exception as chain_error:
|
| 276 |
logger.error(f"Chain creation error: {chain_error}")
|
| 277 |
-
return f"❌ Error creating QA chain: {str(chain_error)}"
|
| 278 |
|
| 279 |
pdf_files = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
|
| 280 |
return f"✅ Successfully processed {len(documents)} documents from {len(pdf_files)} PDF files into {len(chunks)} chunks. Ready for questions!"
|
|
@@ -302,19 +303,18 @@ def extract_zip_to_pdfs(zip_file):
|
|
| 302 |
|
| 303 |
for pdf_file in pdf_files:
|
| 304 |
# Extract to PDFs folder
|
| 305 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
-
# If file is in a subfolder, move it to the root of PDFs folder
|
| 308 |
-
extracted_path = os.path.join(PDF_FOLDER_PATH, pdf_file)
|
| 309 |
-
if os.path.dirname(pdf_file): # File is in a subfolder
|
| 310 |
-
new_path = os.path.join(PDF_FOLDER_PATH, os.path.basename(pdf_file))
|
| 311 |
-
shutil.move(extracted_path, new_path)
|
| 312 |
-
# Clean up empty directories
|
| 313 |
-
try:
|
| 314 |
-
os.rmdir(os.path.dirname(extracted_path))
|
| 315 |
-
except:
|
| 316 |
-
pass
|
| 317 |
-
|
| 318 |
global PRELOADED_PDFS
|
| 319 |
PRELOADED_PDFS = True
|
| 320 |
|
|
@@ -401,11 +401,11 @@ Helpful Answer:
|
|
| 401 |
test_result = retrieval_qa({"query": "test"})
|
| 402 |
logger.info("QA chain test successful")
|
| 403 |
except Exception as test_error:
|
| 404 |
-
logger.warning(f"QA chain test failed: {test_error}")
|
| 405 |
|
| 406 |
except Exception as chain_error:
|
| 407 |
logger.error(f"Chain creation error: {chain_error}")
|
| 408 |
-
return f"❌ Error creating QA chain: {str(chain_error)}"
|
| 409 |
|
| 410 |
# Clean up temp directory
|
| 411 |
shutil.rmtree(temp_dir)
|
|
@@ -427,7 +427,7 @@ def answer_question(question):
|
|
| 427 |
return "❌ Please upload and process PDF files first.", ""
|
| 428 |
|
| 429 |
try:
|
| 430 |
-
# Get answer from RAG system
|
| 431 |
result = retrieval_qa({"query": question})
|
| 432 |
|
| 433 |
answer = result.get("result", "No answer generated")
|
|
@@ -448,7 +448,7 @@ def answer_question(question):
|
|
| 448 |
except Exception as e:
|
| 449 |
logger.error(f"Question answering error: {e}")
|
| 450 |
|
| 451 |
-
# Provide a fallback response using just the retriever
|
| 452 |
try:
|
| 453 |
if vectorstore is not None:
|
| 454 |
# Get relevant documents directly from vectorstore
|
|
@@ -469,11 +469,11 @@ def answer_question(question):
|
|
| 469 |
|
| 470 |
return fallback_answer + "\n*Note: This is a direct search result due to a technical issue with the AI model.*", sources_text
|
| 471 |
else:
|
| 472 |
-
return f"❌ Error answering question: {str(e)}", ""
|
| 473 |
|
| 474 |
except Exception as fallback_error:
|
| 475 |
-
logger.error(f"Fallback error: {fallback_error}")
|
| 476 |
-
return f"❌
|
| 477 |
|
| 478 |
def create_interface():
|
| 479 |
"""Create the fully responsive Gradio interface"""
|
|
@@ -530,6 +530,28 @@ def create_interface():
|
|
| 530 |
min-width: 0 !important;
|
| 531 |
}
|
| 532 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
/* Mobile-first responsive breakpoints */
|
| 534 |
|
| 535 |
/* Small devices (phones, 320px and up) */
|
|
@@ -593,6 +615,24 @@ def create_interface():
|
|
| 593 |
.gr-accordion {
|
| 594 |
border-radius: var(--radius-md) !important;
|
| 595 |
border: 1px solid var(--border-color) !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
}
|
| 597 |
|
| 598 |
/* Slider improvements */
|
|
@@ -634,11 +674,16 @@ def create_interface():
|
|
| 634 |
|
| 635 |
/* Two-column layout for medium screens */
|
| 636 |
.gr-column:first-child {
|
| 637 |
-
flex: 0 0
|
|
|
|
| 638 |
}
|
| 639 |
|
| 640 |
.gr-column:last-child {
|
| 641 |
-
flex:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
}
|
| 643 |
}
|
| 644 |
|
|
@@ -667,7 +712,8 @@ def create_interface():
|
|
| 667 |
|
| 668 |
/* Optimal desktop layout */
|
| 669 |
.gr-column:first-child {
|
| 670 |
-
flex: 0 0
|
|
|
|
| 671 |
}
|
| 672 |
|
| 673 |
.gr-column:last-child {
|
|
@@ -785,7 +831,7 @@ def create_interface():
|
|
| 785 |
|
| 786 |
.gr-textbox textarea:focus,
|
| 787 |
.gr-textbox input:focus {
|
| 788 |
-
|
| 789 |
outline: none !important;
|
| 790 |
box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1) !important;
|
| 791 |
}
|
|
@@ -882,443 +928,107 @@ def create_interface():
|
|
| 882 |
background: var(--text-secondary) !important;
|
| 883 |
}
|
| 884 |
|
| 885 |
-
/*
|
| 886 |
-
.
|
| 887 |
-
|
| 888 |
-
}
|
| 889 |
-
|
| 890 |
-
@keyframes fadeIn {
|
| 891 |
-
from { opacity: 0; transform: translateY(10px); }
|
| 892 |
-
to { opacity: 1; transform: translateY(0); }
|
| 893 |
-
}
|
| 894 |
-
|
| 895 |
-
/* Accessibility improvements */
|
| 896 |
-
.gr-button:focus-visible,
|
| 897 |
-
.gr-textbox input:focus-visible,
|
| 898 |
-
.gr-textbox textarea:focus-visible {
|
| 899 |
-
outline: 2px solid var(--primary-color) !important;
|
| 900 |
-
outline-offset: 2px !important;
|
| 901 |
-
}
|
| 902 |
-
|
| 903 |
-
/* Print styles */
|
| 904 |
-
@media print {
|
| 905 |
-
.gr-button,
|
| 906 |
-
.gr-file,
|
| 907 |
-
.gr-slider {
|
| 908 |
-
display: none !important;
|
| 909 |
-
}
|
| 910 |
-
|
| 911 |
-
.gr-textbox textarea,
|
| 912 |
-
.gr-textbox input {
|
| 913 |
-
border: 1px solid #000 !important;
|
| 914 |
-
background: white !important;
|
| 915 |
-
}
|
| 916 |
-
}
|
| 917 |
-
|
| 918 |
-
/* High contrast mode support */
|
| 919 |
-
@media (prefers-contrast: high) {
|
| 920 |
-
:root {
|
| 921 |
-
--border-color: #000000;
|
| 922 |
-
--text-primary: #000000;
|
| 923 |
-
--text-secondary: #333333;
|
| 924 |
-
--bg-primary: #ffffff;
|
| 925 |
-
--bg-secondary: #f0f0f0;
|
| 926 |
-
}
|
| 927 |
-
}
|
| 928 |
-
|
| 929 |
-
/* Reduced motion support */
|
| 930 |
-
@media (prefers-reduced-motion: reduce) {
|
| 931 |
-
* {
|
| 932 |
-
animation-duration: 0.01ms !important;
|
| 933 |
-
animation-iteration-count: 1 !important;
|
| 934 |
-
transition-duration: 0.01ms !important;
|
| 935 |
-
}
|
| 936 |
-
}
|
| 937 |
-
|
| 938 |
-
/* Error and success states */
|
| 939 |
-
.gr-textbox.error textarea,
|
| 940 |
-
.gr-textbox.error input {
|
| 941 |
-
border-color: #ef4444 !important;
|
| 942 |
-
background: rgba(239, 68, 68, 0.05) !important;
|
| 943 |
-
}
|
| 944 |
-
|
| 945 |
-
.gr-textbox.success textarea,
|
| 946 |
-
.gr-textbox.success input {
|
| 947 |
-
border-color: var(--secondary-color) !important;
|
| 948 |
-
background: rgba(16, 185, 129, 0.05) !important;
|
| 949 |
-
}
|
| 950 |
-
|
| 951 |
-
/* Custom status messages */
|
| 952 |
-
.status-message {
|
| 953 |
-
padding: 0.75rem 1rem !important;
|
| 954 |
-
border-radius: var(--radius-md) !important;
|
| 955 |
-
margin: 0.5rem 0 !important;
|
| 956 |
-
font-size: 0.875rem !important;
|
| 957 |
-
font-weight: 500 !important;
|
| 958 |
}
|
|
|
|
| 959 |
|
| 960 |
-
.
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 965 |
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
-
|
|
|
|
|
|
|
|
|
|
| 971 |
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
| 977 |
|
| 978 |
-
|
| 979 |
-
|
| 980 |
-
|
| 981 |
-
|
| 982 |
-
.gr-file:focus {
|
| 983 |
-
outline: 2px solid var(--primary-color) !important;
|
| 984 |
-
outline-offset: 2px !important;
|
| 985 |
-
}
|
| 986 |
|
| 987 |
-
/* Custom scrollable areas */
|
| 988 |
-
.scrollable-content {
|
| 989 |
-
max-height: 400px !important;
|
| 990 |
-
overflow-y: auto !important;
|
| 991 |
-
padding: 1rem !important;
|
| 992 |
-
background: var(--bg-secondary) !important;
|
| 993 |
-
border-radius: var(--radius-md) !important;
|
| 994 |
-
border: 1px solid var(--border-color) !important;
|
| 995 |
-
}
|
| 996 |
-
"""
|
| 997 |
-
|
| 998 |
-
# Create the interface
|
| 999 |
-
with gr.Blocks(css=custom_css, title="📚 RAG PDF Chat Interface", theme=gr.themes.Soft()) as interface:
|
| 1000 |
-
|
| 1001 |
-
# Header
|
| 1002 |
-
gr.Markdown("""
|
| 1003 |
-
# 📚 RAG PDF Chat Interface
|
| 1004 |
-
|
| 1005 |
-
**Upload PDF documents and ask questions about their content using advanced AI**
|
| 1006 |
-
|
| 1007 |
-
This interface allows you to:
|
| 1008 |
-
- Upload PDF files or ZIP archives containing PDFs
|
| 1009 |
-
- Process documents using state-of-the-art text chunking and embedding techniques
|
| 1010 |
-
- Ask questions about your documents using natural language
|
| 1011 |
-
- Get accurate answers with source citations
|
| 1012 |
-
""")
|
| 1013 |
-
|
| 1014 |
-
# Main interface layout
|
| 1015 |
-
with gr.Row():
|
| 1016 |
-
# Left column - Controls
|
| 1017 |
-
with gr.Column(scale=1):
|
| 1018 |
-
|
| 1019 |
-
# Pre-loaded PDFs section
|
| 1020 |
-
with gr.Accordion("📁 Pre-loaded PDFs", open=PRELOADED_PDFS):
|
| 1021 |
-
gr.Markdown("""
|
| 1022 |
-
**Option 1: Use pre-existing PDFs**
|
| 1023 |
-
|
| 1024 |
-
If you have PDFs in the `./pdfs` folder, click the button below to process them.
|
| 1025 |
-
""")
|
| 1026 |
-
|
| 1027 |
-
preload_btn = gr.Button(
|
| 1028 |
-
"🔄 Load Pre-existing PDFs",
|
| 1029 |
-
variant="secondary",
|
| 1030 |
-
size="sm"
|
| 1031 |
-
)
|
| 1032 |
-
preload_status = gr.Textbox(
|
| 1033 |
-
label="Pre-load Status",
|
| 1034 |
-
interactive=False,
|
| 1035 |
-
lines=2
|
| 1036 |
-
)
|
| 1037 |
-
|
| 1038 |
-
# ZIP upload section
|
| 1039 |
-
with gr.Accordion("📦 Upload ZIP Archive", open=False):
|
| 1040 |
-
gr.Markdown("""
|
| 1041 |
-
**Option 2: Upload ZIP containing PDFs**
|
| 1042 |
-
|
| 1043 |
-
Upload a ZIP file containing PDF documents. They will be extracted to the PDFs folder.
|
| 1044 |
-
""")
|
| 1045 |
-
|
| 1046 |
-
zip_file = gr.File(
|
| 1047 |
-
label="Upload ZIP Archive",
|
| 1048 |
-
file_types=[".zip"],
|
| 1049 |
-
file_count="single"
|
| 1050 |
-
)
|
| 1051 |
-
zip_btn = gr.Button(
|
| 1052 |
-
"📦 Extract ZIP to PDFs",
|
| 1053 |
-
variant="secondary",
|
| 1054 |
-
size="sm"
|
| 1055 |
-
)
|
| 1056 |
-
zip_status = gr.Textbox(
|
| 1057 |
-
label="ZIP Status",
|
| 1058 |
-
interactive=False,
|
| 1059 |
-
lines=2
|
| 1060 |
-
)
|
| 1061 |
-
|
| 1062 |
-
# Direct PDF upload section
|
| 1063 |
-
with gr.Accordion("📄 Upload PDF Files", open=True):
|
| 1064 |
-
gr.Markdown("""
|
| 1065 |
-
**Option 3: Direct PDF upload**
|
| 1066 |
-
|
| 1067 |
-
Upload PDF files directly for processing.
|
| 1068 |
-
""")
|
| 1069 |
-
|
| 1070 |
-
pdf_files = gr.File(
|
| 1071 |
-
label="Upload PDF Files",
|
| 1072 |
-
file_types=[".pdf"],
|
| 1073 |
-
file_count="multiple"
|
| 1074 |
-
)
|
| 1075 |
-
|
| 1076 |
-
# Processing parameters
|
| 1077 |
with gr.Accordion("⚙️ Processing Parameters", open=False):
|
| 1078 |
-
gr.
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
Adjust these parameters to optimize document processing for your specific needs.
|
| 1082 |
-
""")
|
| 1083 |
-
|
| 1084 |
-
chunk_size = gr.Slider(
|
| 1085 |
-
minimum=500,
|
| 1086 |
maximum=2000,
|
| 1087 |
value=1000,
|
| 1088 |
-
step=
|
| 1089 |
label="Chunk Size",
|
| 1090 |
-
info="Size of text chunks for processing
|
| 1091 |
)
|
| 1092 |
-
|
| 1093 |
-
chunk_overlap = gr.Slider(
|
| 1094 |
minimum=0,
|
| 1095 |
maximum=500,
|
| 1096 |
value=200,
|
| 1097 |
-
step=
|
| 1098 |
label="Chunk Overlap",
|
| 1099 |
-
info="Overlap between chunks
|
| 1100 |
-
)
|
| 1101 |
-
|
| 1102 |
-
# Process button
|
| 1103 |
-
process_btn = gr.Button(
|
| 1104 |
-
"🚀 Process Documents",
|
| 1105 |
-
variant="primary",
|
| 1106 |
-
size="lg"
|
| 1107 |
-
)
|
| 1108 |
-
|
| 1109 |
-
# Status display
|
| 1110 |
-
status_output = gr.Textbox(
|
| 1111 |
-
label="Processing Status",
|
| 1112 |
-
interactive=False,
|
| 1113 |
-
lines=4
|
| 1114 |
-
)
|
| 1115 |
-
|
| 1116 |
-
# Right column - Chat interface
|
| 1117 |
-
with gr.Column(scale=2):
|
| 1118 |
-
|
| 1119 |
-
# Chat interface
|
| 1120 |
-
with gr.Tab("💬 Chat with Documents"):
|
| 1121 |
-
gr.Markdown("""
|
| 1122 |
-
**Ask questions about your documents**
|
| 1123 |
-
|
| 1124 |
-
Once you've processed your PDFs, you can ask questions about their content.
|
| 1125 |
-
The AI will provide answers based on the information in your documents.
|
| 1126 |
-
""")
|
| 1127 |
-
|
| 1128 |
-
# Question input
|
| 1129 |
-
question_input = gr.Textbox(
|
| 1130 |
-
label="Ask a question about your documents",
|
| 1131 |
-
placeholder="e.g., What is the main topic discussed in the document?",
|
| 1132 |
-
lines=2
|
| 1133 |
-
)
|
| 1134 |
-
|
| 1135 |
-
# Ask button
|
| 1136 |
-
ask_btn = gr.Button(
|
| 1137 |
-
"🔍 Ask Question",
|
| 1138 |
-
variant="primary",
|
| 1139 |
-
size="lg"
|
| 1140 |
)
|
| 1141 |
-
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
| 1151 |
-
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
-
|
| 1157 |
-
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
gr.Markdown("""
|
| 1161 |
-
## 🔧 How to Use This Interface
|
| 1162 |
-
|
| 1163 |
-
### Step 1: Upload Documents
|
| 1164 |
-
Choose one of three options:
|
| 1165 |
-
- **Pre-loaded PDFs**: Use documents already in the `./pdfs` folder
|
| 1166 |
-
- **ZIP Archive**: Upload a ZIP file containing multiple PDFs
|
| 1167 |
-
- **Direct Upload**: Upload PDF files directly
|
| 1168 |
-
|
| 1169 |
-
### Step 2: Process Documents
|
| 1170 |
-
Click "Process Documents" to:
|
| 1171 |
-
- Extract text from PDFs
|
| 1172 |
-
- Split text into manageable chunks
|
| 1173 |
-
- Create embeddings for semantic search
|
| 1174 |
-
- Set up the question-answering system
|
| 1175 |
-
|
| 1176 |
-
### Step 3: Ask Questions
|
| 1177 |
-
Once processing is complete, you can:
|
| 1178 |
-
- Ask specific questions about document content
|
| 1179 |
-
- Get answers with source citations
|
| 1180 |
-
- Explore different aspects of your documents
|
| 1181 |
-
|
| 1182 |
-
## 💡 Tips for Better Results
|
| 1183 |
-
|
| 1184 |
-
### Question Formatting
|
| 1185 |
-
- **Good**: "What are the main findings about climate change?"
|
| 1186 |
-
- **Better**: "What specific evidence does the document provide about climate change impacts?"
|
| 1187 |
-
- **Best**: "According to the research, what are the three most significant climate change impacts on agriculture?"
|
| 1188 |
-
|
| 1189 |
-
### Document Preparation
|
| 1190 |
-
- Use high-quality, text-based PDFs (not scanned images)
|
| 1191 |
-
- Ensure documents are well-structured with clear headings
|
| 1192 |
-
- Remove unnecessary pages to improve processing speed
|
| 1193 |
-
|
| 1194 |
-
### Processing Parameters
|
| 1195 |
-
- **Chunk Size**:
|
| 1196 |
-
- Larger (1500-2000): Better for broad context questions
|
| 1197 |
-
- Smaller (500-1000): Better for specific detail questions
|
| 1198 |
-
- **Chunk Overlap**:
|
| 1199 |
-
- More overlap (200-300): Better context continuity
|
| 1200 |
-
- Less overlap (0-100): Faster processing
|
| 1201 |
-
|
| 1202 |
-
## 🚨 Troubleshooting
|
| 1203 |
-
|
| 1204 |
-
### Common Issues
|
| 1205 |
-
- **"No documents loaded"**: Check PDF file format and quality
|
| 1206 |
-
- **"Model initialization failed"**: Verify HuggingFace token is set
|
| 1207 |
-
- **"Processing timeout"**: Try smaller chunk sizes or fewer documents
|
| 1208 |
-
- **"Empty answers"**: Rephrase questions or check document content
|
| 1209 |
-
|
| 1210 |
-
### System Requirements
|
| 1211 |
-
- **HuggingFace Token**: Required for AI model access
|
| 1212 |
-
- **Memory**: At least 4GB RAM recommended for large documents
|
| 1213 |
-
- **Storage**: Sufficient space for temporary file processing
|
| 1214 |
-
|
| 1215 |
-
## 🔒 Privacy & Security
|
| 1216 |
-
|
| 1217 |
-
- Documents are processed locally when possible
|
| 1218 |
-
- No document content is permanently stored
|
| 1219 |
-
- AI model queries may be sent to HuggingFace servers
|
| 1220 |
-
- Remove sensitive information before processing
|
| 1221 |
-
|
| 1222 |
-
## 📚 Supported Features
|
| 1223 |
-
|
| 1224 |
-
- **File Types**: PDF documents only
|
| 1225 |
-
- **Languages**: Primarily English, limited support for other languages
|
| 1226 |
-
- **Document Size**: Up to 50MB per PDF recommended
|
| 1227 |
-
- **Concurrent Processing**: Multiple documents simultaneously
|
| 1228 |
-
|
| 1229 |
-
---
|
| 1230 |
-
|
| 1231 |
-
*Need more help? Check the console output for detailed error messages and logs.*
|
| 1232 |
-
""")
|
| 1233 |
-
|
| 1234 |
-
# Event handlers
|
| 1235 |
-
def handle_preload():
|
| 1236 |
-
return load_preloaded_pdfs()
|
| 1237 |
-
|
| 1238 |
-
def handle_zip_extract(zip_file):
|
| 1239 |
-
return extract_zip_to_pdfs(zip_file)
|
| 1240 |
-
|
| 1241 |
-
def handle_process(pdf_files, chunk_size, chunk_overlap):
|
| 1242 |
-
return process_pdfs(pdf_files, chunk_size, chunk_overlap)
|
| 1243 |
-
|
| 1244 |
-
def handle_question(question):
|
| 1245 |
-
return answer_question(question)
|
| 1246 |
-
|
| 1247 |
-
# Connect event handlers
|
| 1248 |
-
preload_btn.click(
|
| 1249 |
-
fn=handle_preload,
|
| 1250 |
-
outputs=preload_status
|
| 1251 |
)
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
-
|
| 1256 |
-
outputs=zip_status
|
| 1257 |
)
|
| 1258 |
-
|
| 1259 |
process_btn.click(
|
| 1260 |
-
|
| 1261 |
-
inputs=[
|
| 1262 |
-
outputs=
|
| 1263 |
)
|
| 1264 |
-
|
| 1265 |
ask_btn.click(
|
| 1266 |
-
|
| 1267 |
inputs=question_input,
|
| 1268 |
outputs=[answer_output, sources_output]
|
| 1269 |
)
|
| 1270 |
-
|
| 1271 |
-
# Enable Enter key for question input
|
| 1272 |
-
question_input.submit(
|
| 1273 |
-
fn=handle_question,
|
| 1274 |
-
inputs=question_input,
|
| 1275 |
-
outputs=[answer_output, sources_output]
|
| 1276 |
-
)
|
| 1277 |
-
|
| 1278 |
-
# Add keyboard shortcuts info
|
| 1279 |
-
gr.Markdown("""
|
| 1280 |
-
---
|
| 1281 |
-
**💡 Keyboard Shortcuts**: Press Enter in the question box to ask your question quickly!
|
| 1282 |
-
""")
|
| 1283 |
-
|
| 1284 |
-
return interface
|
| 1285 |
|
| 1286 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1287 |
if __name__ == "__main__":
|
| 1288 |
-
|
| 1289 |
-
|
| 1290 |
-
|
| 1291 |
-
|
| 1292 |
-
|
| 1293 |
-
print(f"✅ LangChain Available: {LANGCHAIN_AVAILABLE}")
|
| 1294 |
-
print(f"✅ HuggingFace Hub Available: {HUGGINGFACE_HUB_AVAILABLE}")
|
| 1295 |
-
print(f"✅ Pre-loaded PDFs: {PRELOADED_PDFS}")
|
| 1296 |
-
print(f"✅ PDF Folder: {PDF_FOLDER_PATH}")
|
| 1297 |
-
|
| 1298 |
-
# Check for HuggingFace token
|
| 1299 |
-
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 1300 |
-
if hf_token:
|
| 1301 |
-
print("✅ HuggingFace API Token: Found")
|
| 1302 |
-
else:
|
| 1303 |
-
print("❌ HuggingFace API Token: Not found - Please set HUGGINGFACEHUB_API_TOKEN environment variable")
|
| 1304 |
-
|
| 1305 |
-
# Launch the interface
|
| 1306 |
-
try:
|
| 1307 |
-
interface.launch(
|
| 1308 |
-
server_name="0.0.0.0",
|
| 1309 |
-
server_port=7860,
|
| 1310 |
-
share=False,
|
| 1311 |
-
debug=False,
|
| 1312 |
-
show_error=True,
|
| 1313 |
-
auth=None,
|
| 1314 |
-
favicon_path=None,
|
| 1315 |
-
ssl_keyfile=None,
|
| 1316 |
-
ssl_certfile=None,
|
| 1317 |
-
ssl_keyfile_password=None,
|
| 1318 |
-
height=800,
|
| 1319 |
-
prevent_thread_lock=False
|
| 1320 |
-
)
|
| 1321 |
-
except Exception as e:
|
| 1322 |
-
logger.error(f"Failed to launch interface: {e}")
|
| 1323 |
-
print(f"❌ Failed to launch interface: {e}")
|
| 1324 |
-
print("🔧 Try running with: python your_script.py")
|
|
|
|
| 10 |
logging.basicConfig(level=logging.INFO)
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
| 13 |
+
# Try importing LangChain components
|
| 14 |
try:
|
| 15 |
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
|
| 16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 32 |
except ImportError:
|
| 33 |
try:
|
| 34 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 35 |
+
HUGGINGFACE_HUB_AVAILABLE = False # HuggingFaceEndpoint doesn't have the same interface as HuggingFaceHub
|
| 36 |
logger.info("Using HuggingFaceEndpoint as fallback")
|
| 37 |
except ImportError:
|
| 38 |
logger.error("No suitable HuggingFace LLM implementation found")
|
|
|
|
| 70 |
# Get HuggingFace token from environment
|
| 71 |
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 72 |
if not hf_token:
|
| 73 |
+
return False, "❌ HuggingFace API token not found in environment variables. Please set HUGGINGFACEHUB_API_TOKEN."
|
| 74 |
|
| 75 |
return True, "✅ Models initialized successfully"
|
| 76 |
|
|
|
|
| 83 |
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 84 |
|
| 85 |
if not hf_token:
|
| 86 |
+
logger.error("HuggingFace API token not found for LLM creation.")
|
| 87 |
return create_fallback_llm()
|
| 88 |
|
| 89 |
try:
|
|
|
|
| 117 |
logger.warning(f"Failed to initialize {model_id} with HuggingFaceHub: {model_error}")
|
| 118 |
continue
|
| 119 |
|
| 120 |
+
# Fallback to HuggingFaceEndpoint if HuggingFaceHub is not available or failed
|
| 121 |
try:
|
| 122 |
from langchain_huggingface import HuggingFaceEndpoint
|
| 123 |
|
|
|
|
| 148 |
logger.warning(f"Failed to initialize {model_id} with HuggingFaceEndpoint: {model_error}")
|
| 149 |
continue
|
| 150 |
except ImportError:
|
| 151 |
+
pass # HuggingFaceEndpoint not available
|
| 152 |
|
| 153 |
# If all else fails, return fallback
|
| 154 |
+
raise Exception("All HuggingFace model initialization attempts failed")
|
| 155 |
|
| 156 |
except Exception as e:
|
| 157 |
logger.error(f"LLM creation error: {e}")
|
|
|
|
| 196 |
def invoke(self, prompt):
|
| 197 |
return "System temporarily unavailable. Please try again later."
|
| 198 |
|
| 199 |
+
def __call__(self, prompt): # For compatibility with older LangChain chains
|
| 200 |
return self.invoke(prompt)
|
| 201 |
|
| 202 |
return SimpleFallback()
|
|
|
|
| 223 |
documents = loader.load()
|
| 224 |
|
| 225 |
if not documents:
|
| 226 |
+
return "❌ No documents were loaded from the PDFs folder. Ensure the folder contains valid PDFs."
|
| 227 |
|
| 228 |
# Split documents into chunks
|
| 229 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
|
| 270 |
test_result = retrieval_qa({"query": "test"})
|
| 271 |
logger.info("QA chain test successful")
|
| 272 |
except Exception as test_error:
|
| 273 |
+
logger.warning(f"QA chain test failed during initial run: {test_error}")
|
| 274 |
# Chain created but might have issues - continue anyway
|
| 275 |
|
| 276 |
except Exception as chain_error:
|
| 277 |
logger.error(f"Chain creation error: {chain_error}")
|
| 278 |
+
return f"❌ Error creating QA chain: {str(chain_error)}. Check LLM availability."
|
| 279 |
|
| 280 |
pdf_files = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
|
| 281 |
return f"✅ Successfully processed {len(documents)} documents from {len(pdf_files)} PDF files into {len(chunks)} chunks. Ready for questions!"
|
|
|
|
| 303 |
|
| 304 |
for pdf_file in pdf_files:
|
| 305 |
# Extract to PDFs folder
|
| 306 |
+
# Ensure the path is safe and doesn't lead to directory traversal
|
| 307 |
+
extracted_path = os.path.join(PDF_FOLDER_PATH, os.path.basename(pdf_file))
|
| 308 |
+
|
| 309 |
+
# Check if the extracted path is within the intended PDF_FOLDER_PATH
|
| 310 |
+
if not os.path.abspath(extracted_path).startswith(os.path.abspath(PDF_FOLDER_PATH)):
|
| 311 |
+
logger.warning(f"Attempted path traversal detected: {pdf_file}")
|
| 312 |
+
continue # Skip this file
|
| 313 |
+
|
| 314 |
+
# Extract the file
|
| 315 |
+
with open(extracted_path, "wb") as f:
|
| 316 |
+
f.write(zip_ref.read(pdf_file))
|
| 317 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
global PRELOADED_PDFS
|
| 319 |
PRELOADED_PDFS = True
|
| 320 |
|
|
|
|
| 401 |
test_result = retrieval_qa({"query": "test"})
|
| 402 |
logger.info("QA chain test successful")
|
| 403 |
except Exception as test_error:
|
| 404 |
+
logger.warning(f"QA chain test failed during initial run: {test_error}")
|
| 405 |
|
| 406 |
except Exception as chain_error:
|
| 407 |
logger.error(f"Chain creation error: {chain_error}")
|
| 408 |
+
return f"❌ Error creating QA chain: {str(chain_error)}. Check LLM availability."
|
| 409 |
|
| 410 |
# Clean up temp directory
|
| 411 |
shutil.rmtree(temp_dir)
|
|
|
|
| 427 |
return "❌ Please upload and process PDF files first.", ""
|
| 428 |
|
| 429 |
try:
|
| 430 |
+
# Get answer from RAG system
|
| 431 |
result = retrieval_qa({"query": question})
|
| 432 |
|
| 433 |
answer = result.get("result", "No answer generated")
|
|
|
|
| 448 |
except Exception as e:
|
| 449 |
logger.error(f"Question answering error: {e}")
|
| 450 |
|
| 451 |
+
# Provide a fallback response using just the retriever if LLM fails
|
| 452 |
try:
|
| 453 |
if vectorstore is not None:
|
| 454 |
# Get relevant documents directly from vectorstore
|
|
|
|
| 469 |
|
| 470 |
return fallback_answer + "\n*Note: This is a direct search result due to a technical issue with the AI model.*", sources_text
|
| 471 |
else:
|
| 472 |
+
return f"❌ Error answering question: {str(e)}. Vector store not initialized.", ""
|
| 473 |
|
| 474 |
except Exception as fallback_error:
|
| 475 |
+
logger.error(f"Fallback error during question answering: {fallback_error}")
|
| 476 |
+
return f"❌ Critical error answering question: {str(e)}", ""
|
| 477 |
|
| 478 |
def create_interface():
|
| 479 |
"""Create the fully responsive Gradio interface"""
|
|
|
|
| 530 |
min-width: 0 !important;
|
| 531 |
}
|
| 532 |
|
| 533 |
+
/* Remove any pre-existing or default Gradio styling that might conflict */
|
| 534 |
+
.gradio-container,
|
| 535 |
+
.gr-panel,
|
| 536 |
+
.gr-block,
|
| 537 |
+
.gr-group {
|
| 538 |
+
box-sizing: border-box !important;
|
| 539 |
+
min-width: 0 !important; /* Ensure elements can shrink */
|
| 540 |
+
}
|
| 541 |
+
|
| 542 |
+
/* Ensure images and media scale within their containers */
|
| 543 |
+
img, video {
|
| 544 |
+
max-width: 100% !important;
|
| 545 |
+
height: auto !important;
|
| 546 |
+
display: block !important;
|
| 547 |
+
}
|
| 548 |
+
|
| 549 |
+
/* Specific adjustments for file upload area text */
|
| 550 |
+
.gr-file .file-upload-text {
|
| 551 |
+
font-size: clamp(0.75rem, 3vw, 1rem) !important; /* Make text smaller on mobile */
|
| 552 |
+
line-height: 1.4 !important;
|
| 553 |
+
}
|
| 554 |
+
|
| 555 |
/* Mobile-first responsive breakpoints */
|
| 556 |
|
| 557 |
/* Small devices (phones, 320px and up) */
|
|
|
|
| 615 |
.gr-accordion {
|
| 616 |
border-radius: var(--radius-md) !important;
|
| 617 |
border: 1px solid var(--border-color) !important;
|
| 618 |
+
width: 100% !important; /* Force full width */
|
| 619 |
+
flex: none !important; /* Prevent flex issues */
|
| 620 |
+
}
|
| 621 |
+
/* Adjust spacing for accordions within columns */
|
| 622 |
+
.gr-column .gr-accordion {
|
| 623 |
+
margin-bottom: 1rem !important;
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
/* Ensure direct children of gradio-container also respond well */
|
| 627 |
+
.gradio-container > *:not(.gr-footer) { /* Exclude footer if it exists */
|
| 628 |
+
width: 100% !important;
|
| 629 |
+
margin-left: auto !important;
|
| 630 |
+
margin-right: auto !important;
|
| 631 |
+
}
|
| 632 |
+
|
| 633 |
+
/* Make sure all gradio components inside rows take full width */
|
| 634 |
+
.gr-row > .gr-block {
|
| 635 |
+
width: 100% !important;
|
| 636 |
}
|
| 637 |
|
| 638 |
/* Slider improvements */
|
|
|
|
| 674 |
|
| 675 |
/* Two-column layout for medium screens */
|
| 676 |
.gr-column:first-child {
|
| 677 |
+
flex: 0 0 40% !important;
|
| 678 |
+
max-width: 40% !important;
|
| 679 |
}
|
| 680 |
|
| 681 |
.gr-column:last-child {
|
| 682 |
+
flex: 1 1 55% !important;
|
| 683 |
+
max-width: 55% !important;
|
| 684 |
+
}
|
| 685 |
+
.gr-row {
|
| 686 |
+
justify-content: space-between !important; /* Distribute space */
|
| 687 |
}
|
| 688 |
}
|
| 689 |
|
|
|
|
| 712 |
|
| 713 |
/* Optimal desktop layout */
|
| 714 |
.gr-column:first-child {
|
| 715 |
+
flex: 0 0 350px !important;
|
| 716 |
+
max-width: 350px !important;
|
| 717 |
}
|
| 718 |
|
| 719 |
.gr-column:last-child {
|
|
|
|
| 831 |
|
| 832 |
.gr-textbox textarea:focus,
|
| 833 |
.gr-textbox input:focus {
|
| 834 |
+
border-color: var(--primary-color) !important;
|
| 835 |
outline: none !important;
|
| 836 |
box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1) !important;
|
| 837 |
}
|
|
|
|
| 928 |
background: var(--text-secondary) !important;
|
| 929 |
}
|
| 930 |
|
| 931 |
+
/* Ensure good spacing for text outputs */
|
| 932 |
+
.gr-markdown {
|
| 933 |
+
padding: 1rem 0 !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 934 |
}
|
| 935 |
+
"""
|
| 936 |
|
| 937 |
+
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
|
| 938 |
+
gr.Markdown(
|
| 939 |
+
"""
|
| 940 |
+
# RAG PDF Chat Interface
|
| 941 |
+
Upload PDF documents and ask questions about their content using advanced AI.
|
| 942 |
+
|
| 943 |
+
This interface allows you to:
|
| 944 |
+
- Upload PDF files or ZIP archives containing PDFs
|
| 945 |
+
- Process documents using state-of-the-art text chunking and embedding techniques
|
| 946 |
+
- Ask questions about your documents using natural language
|
| 947 |
+
- Get accurate answers with source citations
|
| 948 |
+
"""
|
| 949 |
+
)
|
| 950 |
|
| 951 |
+
# Main content area
|
| 952 |
+
with gr.Row():
|
| 953 |
+
with gr.Column(scale=1): # This column will contain processing options
|
| 954 |
+
with gr.Accordion("📁 Pre-loaded PDFs", open=True):
|
| 955 |
+
gr.Markdown("### Option 1: Use pre-existing PDFs")
|
| 956 |
+
gr.Markdown("If you have PDFs in the `./pdfs` folder, click the button below to process them.")
|
| 957 |
+
load_preloaded_btn = gr.Button("🔄 Load Pre-existing PDFs", variant="secondary")
|
| 958 |
+
pre_load_status = gr.Textbox(label="Pre-load Status", interactive=False, value="No pre-loaded PDFs processed yet.")
|
| 959 |
|
| 960 |
+
with gr.Accordion("📦 Upload ZIP Archive", open=False):
|
| 961 |
+
gr.Markdown("### Option 2: Upload ZIP Archive")
|
| 962 |
+
zip_file_input = gr.File(label="Upload ZIP File", type="file", file_count="single", file_types=[".zip"])
|
| 963 |
+
extract_zip_btn = gr.Button("📤 Extract ZIP Archive", variant="primary")
|
| 964 |
+
zip_status_output = gr.Textbox(label="ZIP Extraction Status", interactive=False)
|
| 965 |
|
| 966 |
+
with gr.Accordion("📄 Upload PDF Files", open=False):
|
| 967 |
+
gr.Markdown("### Option 3: Direct PDF upload")
|
| 968 |
+
gr.Markdown("Upload PDF files directly for processing.")
|
| 969 |
+
pdf_file_input = gr.File(label="Upload PDF Files", type="file", file_count="multiple", file_types=[".pdf"])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 970 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 971 |
with gr.Accordion("⚙️ Processing Parameters", open=False):
|
| 972 |
+
chunk_size_slider = gr.Slider(
|
| 973 |
+
minimum=100,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 974 |
maximum=2000,
|
| 975 |
value=1000,
|
| 976 |
+
step=50,
|
| 977 |
label="Chunk Size",
|
| 978 |
+
info="Size of text chunks for processing."
|
| 979 |
)
|
| 980 |
+
chunk_overlap_slider = gr.Slider(
|
|
|
|
| 981 |
minimum=0,
|
| 982 |
maximum=500,
|
| 983 |
value=200,
|
| 984 |
+
step=10,
|
| 985 |
label="Chunk Overlap",
|
| 986 |
+
info="Overlap between text chunks to maintain context."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 987 |
)
|
| 988 |
+
process_btn = gr.Button("🚀 Process Documents", variant="primary")
|
| 989 |
+
processing_status = gr.Textbox(label="Processing Status", interactive=False)
|
| 990 |
+
|
| 991 |
+
with gr.Column(scale=2): # This column will contain the chat interface
|
| 992 |
+
with gr.Accordion("💬 Chat with Documents", open=True):
|
| 993 |
+
gr.Markdown("### Ask questions about your documents")
|
| 994 |
+
gr.Markdown("Once you've processed your PDFs, you can ask questions about their content. The AI will provide answers based on the information in your documents.")
|
| 995 |
+
question_input = gr.Textbox(label="Ask a question about your documents", placeholder="e.g., What is the main topic of the documents?")
|
| 996 |
+
answer_output = gr.Textbox(label="Answer", interactive=False)
|
| 997 |
+
sources_output = gr.Textbox(label="Sources & References", interactive=False)
|
| 998 |
+
ask_btn = gr.Button("🔍 Ask Question", variant="primary")
|
| 999 |
+
|
| 1000 |
+
gr.Markdown("❓ Help & Tips: Ensure you have your HuggingFace API token set as an environment variable (HUGGINGFACEHUB_API_TOKEN) for the LLM to function properly.")
|
| 1001 |
+
|
| 1002 |
+
# Event listeners: connect each button to its handler function.
load_preloaded_btn.click(
    load_preloaded_pdfs,
    inputs=[chunk_size_slider, chunk_overlap_slider],  # Pass sliders to function
    outputs=pre_load_status,
)
extract_zip_btn.click(
    extract_zip_to_pdfs,
    inputs=zip_file_input,
    outputs=zip_status_output,
)
process_btn.click(
    process_pdfs,
    inputs=[pdf_file_input, chunk_size_slider, chunk_overlap_slider],
    outputs=processing_status,
)
ask_btn.click(
    answer_question,
    inputs=question_input,
    outputs=[answer_output, sources_output],
)

# Initial model check, run when the page loads.
# initialize_models() returns a (success, message) pair, but there is only
# one output component here. Passing the pair straight through would make
# the status box display the tuple itself, so only the message is forwarded.
demo.load(lambda: initialize_models()[1], outputs=pre_load_status)  # Use pre_load_status to show init message
|
| 1026 |
+
|
| 1027 |
+
return demo
|
| 1028 |
+
|
| 1029 |
if __name__ == "__main__":
    # Build the Gradio interface and start the server.
    demo = create_interface()
    # share=False is set explicitly so that no public link is created;
    # switch it to True only if you intend to share the app publicly.
    demo.launch(share=False, show_api=False, inline=False)
|
| 1034 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|