Spaces:

HamidOmarov
/

PDF-QA-Generator

Sleeping

App Files Files Community

HamidOmarov committed on Aug 13, 2025

Commit

7740cb7

1 Parent(s): 2db539f

Make Space self-contained: no LLM deps; proper Docker; README frontmatter

Browse files

Files changed (6) hide show

.gitattributes +2 -2
Dockerfile +16 -6
README.md +11 -6
generator.py +48 -49
main.py +31 -31
requirements.txt +5 -5

.gitattributes CHANGED Viewed

@@ -1,4 +1,4 @@
-* text=auto
 *.md text eol=lf
 *.py text eol=lf
-Dockerfile text eol=lf

+* text=auto eol=lf
 *.md text eol=lf
 *.py text eol=lf
+Dockerfile text eol=lf

Dockerfile CHANGED Viewed

@@ -1,7 +1,17 @@
-FROM python:3.10-slim
 WORKDIR /app
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-COPY . .
-ENV PORT=7860
-CMD ["python", "main.py"]

# Dockerfile
# Container image for the PDF Q&A Generator Space: FastAPI served by uvicorn.
FROM python:3.11-slim

# Skip .pyc files, keep stdout/stderr unbuffered so logs appear immediately,
# disable pip's download cache, and declare the default listening port.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PORT=7860

WORKDIR /app

# Install dependencies before copying the sources so this layer stays cached
# until requirements.txt itself changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY . /app

EXPOSE 7860

# Exec-form CMD does no variable expansion, so go through `sh -c` to honour
# $PORT (falling back to 7860). `exec` replaces the shell with uvicorn so the
# server receives stop signals directly.
CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]

README.md CHANGED Viewed

@@ -1,13 +1,18 @@
----
 title: PDF Q&A Generator
-emoji: ❓
-colorFrom: purple
-colorTo: indigo
 sdk: docker
 pinned: false
 ---
 # PDF Q&A Generator
-FastAPI service that turns a PDF into Q&A pairs.
-Endpoints: `/`, `/health`, `/generate` (multipart file `file`, param `num_questions`).

+---
 title: PDF Q&A Generator
+emoji: 🧩
+colorFrom: indigo
+colorTo: blue
 sdk: docker
 pinned: false
 ---
 # PDF Q&A Generator
+Convert any PDF into a small Q&A dataset (no external API keys).
+## Usage
+- `POST /generate` with a PDF file (`file`) and optional `num_questions` (default 10)
+- `GET /health` for status
+This Space uses **docker** and serves FastAPI on port **7860**.

generator.py CHANGED Viewed

@@ -1,57 +1,56 @@
-import os, re
 from typing import List, Dict
 class QAGenerator:
-    def __init__(self):
-        self.use_groq = False
-        try:
-            from langchain_groq import ChatGroq  # noqa
-            key = os.getenv("GROQ_API_KEY")
-            if key:
-                self.ChatGroq = ChatGroq
-                self.llm = ChatGroq(api_key=key, model="llama-3.1-70b-versatile")
-                self.use_groq = True
-        except Exception:
-            self.use_groq = False
-    def chunk_text(self, text: str, max_chars: int = 1200) -> List[str]:
-        text = re.sub(r"\s+", " ", text)
-        chunks = []
-        for i in range(0, len(text), max_chars):
-            c = text[i:i+max_chars].strip()
-            if len(c) > 200:
-                chunks.append(c)
-        return chunks
-    def _gen_one_llm(self, chunk: str):
-        q = self.llm.invoke(
-            f"Write ONE specific, clear question about this text:\n\n{chunk}\n\nQuestion:"
-        ).content.strip()
-        a = self.llm.invoke(
-            f"Answer the question using ONLY the text.\n\nText:\n{chunk}\n\nQuestion: {q}\n\nAnswer:"
-        ).content.strip()
-        return q, a
-    def _gen_one_heuristic(self, chunk: str):
         sents = re.split(r'(?<=[.!?])\s+', chunk)
-        first = (sents[0] if sents else chunk)[:180]
-        q = f"What is the main point of: \"{first}...\"?"
-        a = first.strip()
-        return q, a
-    def generate(self, text: str, n: int = 10) -> List[Dict]:
-        pairs = []
-        for i, c in enumerate(self.chunk_text(text)):
-            if len(pairs) >= n: break
-            if self.use_groq:
-                try:
-                    q, a = self._gen_one_llm(c)
-                except Exception:
-                    q, a = self._gen_one_heuristic(c)
-            else:
-                q, a = self._gen_one_heuristic(c)
-            pairs.append({
-                "id": i+1, "question": q.strip(), "answer": a.strip(),
-                "source_excerpt": (c[:200] + "...") if len(c) > 200 else c
             })
-        return pairs

# generator.py
import re
from typing import Dict, List

# Compiled once at import time; both patterns are reused on every call.
_WHITESPACE_RE = re.compile(r"\s+")
_SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.!?])\s+")


class QAGenerator:
    """Deterministic, simple Q&A generator (no LLM).

    Goal: let the Space start up without problems and provide a quick demo.
    """

    # Lightweight question templates, cycled by chunk index (deterministic).
    QUESTION_TEMPLATES = (
        "What is the main idea of this section?",
        "List the key points mentioned here.",
        "Summarize the main purpose discussed.",
        "What steps or procedures are described?",
        "Which entities or tools are referenced?",
    )

    def chunk_text(self, text: str, min_len: int = 200, max_len: int = 1200) -> List[str]:
        """Split *text* into whitespace-normalized chunks of at most *max_len* chars.

        Words are packed greedily, so chunks break on word boundaries and no
        text is discarded by truncation. Chunks shorter than *min_len* are
        dropped; if that leaves nothing, the first *max_len* characters of the
        normalized text are returned as a single chunk.

        Args:
            text: Raw text, e.g. the concatenated pages of a PDF.
            min_len: Minimum number of characters a chunk must have to be kept.
            max_len: Maximum number of characters per chunk (values below 1
                are treated as 1).

        Returns:
            The list of chunks, empty when *text* has no non-whitespace content.
        """
        cleaned = _WHITESPACE_RE.sub(" ", text).strip()
        if not cleaned:
            return []
        max_len = max(1, max_len)

        chunks: List[str] = []
        current: List[str] = []
        current_len = 0
        for word in cleaned.split():
            # A single token longer than max_len is hard-split so that it can
            # never produce an oversized chunk.
            for start in range(0, len(word), max_len):
                piece = word[start:start + max_len]
                needed = len(piece) + (1 if current else 0)  # +1 for the joining space
                if current and current_len + needed > max_len:
                    chunks.append(" ".join(current))
                    current, current_len = [], 0
                    needed = len(piece)
                current.append(piece)
                current_len += needed
        if current:
            chunks.append(" ".join(current))

        # Drop parts that are too short to be useful.
        kept = [chunk for chunk in chunks if len(chunk) >= min_len]
        return kept or [cleaned[:max_len]]

    def make_question(self, chunk: str, idx: int) -> str:
        """Return the template question for position *idx*.

        The choice depends only on *idx*; *chunk* is accepted but not inspected.
        """
        templates = self.QUESTION_TEMPLATES
        return templates[idx % len(templates)]

    def make_answer(self, chunk: str) -> str:
        """Return a heuristic answer: the first one or two sentences of *chunk*.

        Falls back to the first 300 characters of *chunk* when no sentence
        text can be extracted.
        """
        sentences = _SENTENCE_BOUNDARY_RE.split(chunk)
        head = " ".join(sentences[:2]).strip()
        return head if head else chunk[:300]

    def generate(self, text: str, num_questions: int = 10) -> List[Dict]:
        """Build up to *num_questions* Q&A records from *text*.

        At least one record is produced whenever *text* yields a chunk, even
        if *num_questions* is zero or negative.

        Returns:
            A list of dicts with the keys ``id`` (1-based), ``question``,
            ``answer`` and ``source_excerpt`` (first 220 characters of the
            chunk, suffixed with ``...`` when the chunk is longer).
        """
        chunks = self.chunk_text(text)
        if not chunks:
            return []
        limit = max(1, num_questions)
        return [
            {
                "id": position,
                "question": self.make_question(chunk, position - 1),
                "answer": self.make_answer(chunk),
                "source_excerpt": chunk[:220] + ("..." if len(chunk) > 220 else ""),
            }
            for position, chunk in enumerate(chunks[:limit], start=1)
        ]

main.py CHANGED Viewed

@@ -1,32 +1,32 @@
-import os, io
 from fastapi import FastAPI, UploadFile, File, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
 try:
     from pypdf import PdfReader
 except Exception:
-    from PyPDF2 import PdfReader  # fallback
 from generator import QAGenerator
 app = FastAPI(
     title="PDF Q&A Dataset Generator",
-    description="Turn any PDF into Q&A pairs (LLM optional)",
     version="1.0.0",
 )
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
-)
 gen = QAGenerator()
 @app.get("/")
 def root():
     return {
-        "service": "PDF Q&A Dataset Generator",
         "endpoints": ["/generate", "/health"],
-        "notes": "Set GROQ_API_KEY for LLM; otherwise uses heuristic fallback."
     }
 @app.get("/health")
@@ -36,29 +36,29 @@ def health():
 @app.post("/generate")
 async def generate_dataset(file: UploadFile = File(...), num_questions: int = 10):
     if not file.filename.lower().endswith(".pdf"):
-        raise HTTPException(400, "Only PDF files are supported")
-    data = await file.read()
-    reader = PdfReader(io.BytesIO(data))
-    text = []
-    for p in reader.pages:
-        try:
             t = p.extract_text() or ""
-        except Exception:
-            t = ""
-        if t.strip():
-            text.append(t)
-    full = "\n".join(text).strip()
-    if not full:
-        raise HTTPException(400, "No extractable text found in PDF")
-    qa = gen.generate(full, num_questions)
     return {
         "filename": file.filename,
         "qa_count": len(qa),
         "dataset": qa,
-        "export_formats": ["json", "csv", "jsonl"]
-    }
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))

+# main.py
 from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import JSONResponse
+import io
+import os
+# Prefer pypdf, fallback PyPDF2
 try:
     from pypdf import PdfReader
 except Exception:
+    from PyPDF2 import PdfReader  # type: ignore
 from generator import QAGenerator
 app = FastAPI(
     title="PDF Q&A Dataset Generator",
+    description="Turn a PDF into a small Q&A dataset for demos.",
     version="1.0.0",
 )
 gen = QAGenerator()
 @app.get("/")
 def root():
     return {
+        "service": "PDF-QA-Generator",
         "endpoints": ["/generate", "/health"],
+        "sdk": "docker",
+        "port_note": "Container listens on $PORT (default 7860)",
     }
 @app.get("/health")
 @app.post("/generate")
 async def generate_dataset(file: UploadFile = File(...), num_questions: int = 10):
     if not file.filename.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")
+    pdf_bytes = await file.read()
+    if not pdf_bytes:
+        raise HTTPException(status_code=400, detail="Empty file.")
+    try:
+        reader = PdfReader(io.BytesIO(pdf_bytes))
+        text_parts = []
+        for p in reader.pages:
             t = p.extract_text() or ""
+            if t.strip():
+                text_parts.append(t)
+        text = "\n".join(text_parts).strip()
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"PDF parse error: {e}")
+    if not text:
+        return {"filename": file.filename, "qa_count": 0, "dataset": []}
+    qa = gen.generate(text, num_questions=num_questions)
     return {
         "filename": file.filename,
         "qa_count": len(qa),
         "dataset": qa,
+        "export_formats": ["json", "jsonl", "csv (client-side)"]
+    }

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
-fastapi
-uvicorn
-pypdf
-PyPDF2
-langchain-groq

+fastapi>=0.110
+uvicorn[standard]>=0.30
+pypdf2>=3.0
+pypdf>=4.2
+python-multipart>=0.0.9