Spaces:
Sleeping
Sleeping
Fix PDF upload handling
Browse files
app.py
CHANGED
|
@@ -36,11 +36,26 @@ def cosine_similarity(left, right):
|
|
| 36 |
|
| 37 |
def extract_text_from_pdf(pdf_file):
|
| 38 |
"""Extract text from PDF file."""
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
text = ""
|
| 41 |
for page in pdf_reader.pages:
|
| 42 |
-
text += page.extract_text() + "\n"
|
| 43 |
-
return text
|
| 44 |
|
| 45 |
def chunk_text(text, chunk_size=500, overlap=50):
|
| 46 |
"""Split text into overlapping chunks."""
|
|
@@ -64,10 +79,13 @@ def process_pdfs(pdf_files, progress=gr.Progress()):
|
|
| 64 |
|
| 65 |
progress(0, desc="Extracting text from PDFs...")
|
| 66 |
for i, pdf_file in enumerate(pdf_files):
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
| 68 |
pdf_chunks = chunk_text(text)
|
| 69 |
chunks.extend(pdf_chunks)
|
| 70 |
-
sources.extend([
|
| 71 |
progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
|
| 72 |
|
| 73 |
if not chunks:
|
|
@@ -131,6 +149,8 @@ Question: {question}
|
|
| 131 |
Answer:"""
|
| 132 |
|
| 133 |
try:
|
|
|
|
|
|
|
| 134 |
response = ""
|
| 135 |
for token in client.text_generation(
|
| 136 |
prompt,
|
|
@@ -150,13 +170,21 @@ Answer:"""
|
|
| 150 |
return response.strip(), chunks_display, citations
|
| 151 |
|
| 152 |
except Exception as e:
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
# Gradio Interface
|
| 156 |
with gr.Blocks(title="RAG from Scratch", theme=gr.themes.Soft()) as demo:
|
| 157 |
create_premium_hero(
|
| 158 |
"RAG from Scratch",
|
| 159 |
-
"A transparent Retrieval-Augmented Generation lab: chunk PDFs,
|
| 160 |
"📚",
|
| 161 |
badge="Retrieval Systems",
|
| 162 |
highlights=["Lexical retrieval", "Chunk inspection", "HF Inference"],
|
|
|
|
| 36 |
|
| 37 |
def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF.

    Args:
        pdf_file: One of the upload shapes handled below:
            * a file-like object exposing ``read()``,
            * a filesystem path (``str`` or ``os.PathLike``),
            * an object exposing a ``path`` attribute,
            * anything else accepted by ``bytes(...)`` (raw PDF bytes).

    Returns:
        A ``(text, source_name)`` tuple. ``text`` is the text of every page,
        each page followed by a newline. ``source_name`` is a display name
        for the document, always reduced to a bare file name.

    Raises:
        Whatever ``open``, ``bytes`` or ``PyPDF2.PdfReader`` raise for an
        unreadable or malformed input; nothing is caught here.
    """
    if hasattr(pdf_file, "read"):
        payload = pdf_file.read()
        # A file-like upload may report a full path in ``.name``; reduce it
        # to a basename so it matches what the path-based branches return.
        # ``str()`` guards against a non-string ``name``.
        raw_name = getattr(pdf_file, "name", None)
        source_name = os.path.basename(str(raw_name)) if raw_name else ""
        source_name = source_name or "uploaded.pdf"
    elif isinstance(pdf_file, (str, os.PathLike)):
        source_name = os.path.basename(str(pdf_file))
        with open(pdf_file, "rb") as handle:
            payload = handle.read()
    elif hasattr(pdf_file, "path"):
        source_name = os.path.basename(str(pdf_file.path))
        with open(pdf_file.path, "rb") as handle:
            payload = handle.read()
    else:
        # Last resort: treat the value as raw PDF bytes.
        payload = bytes(pdf_file)
        source_name = "uploaded.pdf"

    pdf_reader = PyPDF2.PdfReader(io.BytesIO(payload))
    # ``extract_text()`` may yield None for a page; substitute "" so the
    # concatenation never fails.
    text = "".join(
        (page.extract_text() or "") + "\n" for page in pdf_reader.pages
    )
    return text, source_name
|
| 59 |
|
| 60 |
def chunk_text(text, chunk_size=500, overlap=50):
|
| 61 |
"""Split text into overlapping chunks."""
|
|
|
|
| 79 |
|
| 80 |
progress(0, desc="Extracting text from PDFs...")
|
| 81 |
for i, pdf_file in enumerate(pdf_files):
|
| 82 |
+
try:
|
| 83 |
+
text, source_name = extract_text_from_pdf(pdf_file)
|
| 84 |
+
except Exception as exc:
|
| 85 |
+
return f"❌ Could not read PDF: {exc}"
|
| 86 |
pdf_chunks = chunk_text(text)
|
| 87 |
chunks.extend(pdf_chunks)
|
| 88 |
+
sources.extend([source_name] * len(pdf_chunks))
|
| 89 |
progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
|
| 90 |
|
| 91 |
if not chunks:
|
|
|
|
| 149 |
Answer:"""
|
| 150 |
|
| 151 |
try:
|
| 152 |
+
if not os.getenv("HF_TOKEN"):
|
| 153 |
+
raise RuntimeError("HF_TOKEN is not configured; using local extractive fallback.")
|
| 154 |
response = ""
|
| 155 |
for token in client.text_generation(
|
| 156 |
prompt,
|
|
|
|
| 170 |
return response.strip(), chunks_display, citations
|
| 171 |
|
| 172 |
except Exception as e:
|
| 173 |
+
fallback = (
|
| 174 |
+
"No hosted generation token is configured, so this Space is returning the most relevant retrieved evidence instead.\n\n"
|
| 175 |
+
f"**Question:** {question}\n\n"
|
| 176 |
+
f"**Best evidence:** {retrieved_chunks[0][:900]}..."
|
| 177 |
+
)
|
| 178 |
+
citations = "\n\n**Sources:**\n"
|
| 179 |
+
for source in sorted(set(retrieved_sources)):
|
| 180 |
+
citations += f"- {source}\n"
|
| 181 |
+
return fallback, chunks_display, citations
|
| 182 |
|
| 183 |
# Gradio Interface
|
| 184 |
with gr.Blocks(title="RAG from Scratch", theme=gr.themes.Soft()) as demo:
|
| 185 |
create_premium_hero(
|
| 186 |
"RAG from Scratch",
|
| 187 |
+
"A transparent Retrieval-Augmented Generation lab: chunk PDFs, retrieve passages, and answer with cited context.",
|
| 188 |
"📚",
|
| 189 |
badge="Retrieval Systems",
|
| 190 |
highlights=["Lexical retrieval", "Chunk inspection", "HF Inference"],
|