Spaces:

sammoftah
/

rag-from-scratch

Sleeping

App Files Community

sammoftah committed on Apr 25

Commit

8a319d5

verified ·

1 Parent(s): 992675e

Fix PDF upload handling

Browse files

Files changed (1) hide show

app.py +35 -7

app.py CHANGED Viewed

@@ -36,11 +36,26 @@ def cosine_similarity(left, right):
 def extract_text_from_pdf(pdf_file):
     """Extract text from PDF file."""
-    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
     text = ""
     for page in pdf_reader.pages:
-        text += page.extract_text() + "\n"
-    return text
 def chunk_text(text, chunk_size=500, overlap=50):
     """Split text into overlapping chunks."""
@@ -64,10 +79,13 @@ def process_pdfs(pdf_files, progress=gr.Progress()):
     progress(0, desc="Extracting text from PDFs...")
     for i, pdf_file in enumerate(pdf_files):
-        text = extract_text_from_pdf(pdf_file)
         pdf_chunks = chunk_text(text)
         chunks.extend(pdf_chunks)
-        sources.extend([pdf_file.name] * len(pdf_chunks))
         progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
     if not chunks:
@@ -131,6 +149,8 @@ Question: {question}
 Answer:"""
     try:
         response = ""
         for token in client.text_generation(
             prompt,
@@ -150,13 +170,21 @@ Answer:"""
         return response.strip(), chunks_display, citations
     except Exception as e:
-        return f"Error generating answer: {str(e)}", chunks_display, ""
 # Gradio Interface
 with gr.Blocks(title="RAG from Scratch", theme=gr.themes.Soft()) as demo:
     create_premium_hero(
         "RAG from Scratch",
-        "A transparent Retrieval-Augmented Generation lab: chunk PDFs, embed passages, search FAISS, and answer with cited context.",
         "📚",
         badge="Retrieval Systems",
         highlights=["Lexical retrieval", "Chunk inspection", "HF Inference"],

 def extract_text_from_pdf(pdf_file):
     """Extract text from PDF file."""
+    if hasattr(pdf_file, "read"):
+        payload = pdf_file.read()
+        source_name = getattr(pdf_file, "name", "uploaded.pdf")
+    elif isinstance(pdf_file, (str, os.PathLike)):
+        source_name = os.path.basename(str(pdf_file))
+        with open(pdf_file, "rb") as handle:
+            payload = handle.read()
+    elif hasattr(pdf_file, "path"):
+        source_name = os.path.basename(str(pdf_file.path))
+        with open(pdf_file.path, "rb") as handle:
+            payload = handle.read()
+    else:
+        payload = bytes(pdf_file)
+        source_name = "uploaded.pdf"
+    pdf_reader = PyPDF2.PdfReader(io.BytesIO(payload))
     text = ""
     for page in pdf_reader.pages:
+        text += (page.extract_text() or "") + "\n"
+    return text, source_name
 def chunk_text(text, chunk_size=500, overlap=50):
     """Split text into overlapping chunks."""
     progress(0, desc="Extracting text from PDFs...")
     for i, pdf_file in enumerate(pdf_files):
+        try:
+            text, source_name = extract_text_from_pdf(pdf_file)
+        except Exception as exc:
+            return f"❌ Could not read PDF: {exc}"
         pdf_chunks = chunk_text(text)
         chunks.extend(pdf_chunks)
+        sources.extend([source_name] * len(pdf_chunks))
         progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
     if not chunks:
 Answer:"""
     try:
+        if not os.getenv("HF_TOKEN"):
+            raise RuntimeError("HF_TOKEN is not configured; using local extractive fallback.")
         response = ""
         for token in client.text_generation(
             prompt,
         return response.strip(), chunks_display, citations
     except Exception as e:
+        fallback = (
+            "No hosted generation token is configured, so this Space is returning the most relevant retrieved evidence instead.\n\n"
+            f"**Question:** {question}\n\n"
+            f"**Best evidence:** {retrieved_chunks[0][:900]}..."
+        )
+        citations = "\n\n**Sources:**\n"
+        for source in sorted(set(retrieved_sources)):
+            citations += f"- {source}\n"
+        return fallback, chunks_display, citations
 # Gradio Interface
 with gr.Blocks(title="RAG from Scratch", theme=gr.themes.Soft()) as demo:
     create_premium_hero(
         "RAG from Scratch",
+        "A transparent Retrieval-Augmented Generation lab: chunk PDFs, retrieve passages, and answer with cited context.",
         "📚",
         badge="Retrieval Systems",
         highlights=["Lexical retrieval", "Chunk inspection", "HF Inference"],