sammoftah committed on
Commit
8a319d5
·
verified ·
1 Parent(s): 992675e

Fix PDF upload handling

Browse files
Files changed (1) hide show
  1. app.py +35 -7
app.py CHANGED
@@ -36,11 +36,26 @@ def cosine_similarity(left, right):
36
 
37
  def extract_text_from_pdf(pdf_file):
38
  """Extract text from PDF file."""
39
- pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  text = ""
41
  for page in pdf_reader.pages:
42
- text += page.extract_text() + "\n"
43
- return text
44
 
45
  def chunk_text(text, chunk_size=500, overlap=50):
46
  """Split text into overlapping chunks."""
@@ -64,10 +79,13 @@ def process_pdfs(pdf_files, progress=gr.Progress()):
64
 
65
  progress(0, desc="Extracting text from PDFs...")
66
  for i, pdf_file in enumerate(pdf_files):
67
- text = extract_text_from_pdf(pdf_file)
 
 
 
68
  pdf_chunks = chunk_text(text)
69
  chunks.extend(pdf_chunks)
70
- sources.extend([pdf_file.name] * len(pdf_chunks))
71
  progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
72
 
73
  if not chunks:
@@ -131,6 +149,8 @@ Question: {question}
131
  Answer:"""
132
 
133
  try:
 
 
134
  response = ""
135
  for token in client.text_generation(
136
  prompt,
@@ -150,13 +170,21 @@ Answer:"""
150
  return response.strip(), chunks_display, citations
151
 
152
  except Exception as e:
153
- return f"Error generating answer: {str(e)}", chunks_display, ""
 
 
 
 
 
 
 
 
154
 
155
  # Gradio Interface
156
  with gr.Blocks(title="RAG from Scratch", theme=gr.themes.Soft()) as demo:
157
  create_premium_hero(
158
  "RAG from Scratch",
159
- "A transparent Retrieval-Augmented Generation lab: chunk PDFs, embed passages, search FAISS, and answer with cited context.",
160
  "📚",
161
  badge="Retrieval Systems",
162
  highlights=["Lexical retrieval", "Chunk inspection", "HF Inference"],
 
36
 
37
  def extract_text_from_pdf(pdf_file):
38
  """Extract text from PDF file."""
39
+ if hasattr(pdf_file, "read"):
40
+ payload = pdf_file.read()
41
+ source_name = getattr(pdf_file, "name", "uploaded.pdf")
42
+ elif isinstance(pdf_file, (str, os.PathLike)):
43
+ source_name = os.path.basename(str(pdf_file))
44
+ with open(pdf_file, "rb") as handle:
45
+ payload = handle.read()
46
+ elif hasattr(pdf_file, "path"):
47
+ source_name = os.path.basename(str(pdf_file.path))
48
+ with open(pdf_file.path, "rb") as handle:
49
+ payload = handle.read()
50
+ else:
51
+ payload = bytes(pdf_file)
52
+ source_name = "uploaded.pdf"
53
+
54
+ pdf_reader = PyPDF2.PdfReader(io.BytesIO(payload))
55
  text = ""
56
  for page in pdf_reader.pages:
57
+ text += (page.extract_text() or "") + "\n"
58
+ return text, source_name
59
 
60
  def chunk_text(text, chunk_size=500, overlap=50):
61
  """Split text into overlapping chunks."""
 
79
 
80
  progress(0, desc="Extracting text from PDFs...")
81
  for i, pdf_file in enumerate(pdf_files):
82
+ try:
83
+ text, source_name = extract_text_from_pdf(pdf_file)
84
+ except Exception as exc:
85
+ return f"❌ Could not read PDF: {exc}"
86
  pdf_chunks = chunk_text(text)
87
  chunks.extend(pdf_chunks)
88
+ sources.extend([source_name] * len(pdf_chunks))
89
  progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
90
 
91
  if not chunks:
 
149
  Answer:"""
150
 
151
  try:
152
+ if not os.getenv("HF_TOKEN"):
153
+ raise RuntimeError("HF_TOKEN is not configured; using local extractive fallback.")
154
  response = ""
155
  for token in client.text_generation(
156
  prompt,
 
170
  return response.strip(), chunks_display, citations
171
 
172
  except Exception as e:
173
+ fallback = (
174
+ "No hosted generation token is configured, so this Space is returning the most relevant retrieved evidence instead.\n\n"
175
+ f"**Question:** {question}\n\n"
176
+ f"**Best evidence:** {retrieved_chunks[0][:900]}..."
177
+ )
178
+ citations = "\n\n**Sources:**\n"
179
+ for source in sorted(set(retrieved_sources)):
180
+ citations += f"- {source}\n"
181
+ return fallback, chunks_display, citations
182
 
183
  # Gradio Interface
184
  with gr.Blocks(title="RAG from Scratch", theme=gr.themes.Soft()) as demo:
185
  create_premium_hero(
186
  "RAG from Scratch",
187
+ "A transparent Retrieval-Augmented Generation lab: chunk PDFs, retrieve passages, and answer with cited context.",
188
  "📚",
189
  badge="Retrieval Systems",
190
  highlights=["Lexical retrieval", "Chunk inspection", "HF Inference"],