HamidOmarov commited on
Commit
7740cb7
·
1 Parent(s): 2db539f

Make Space self-contained: no LLM deps; proper Docker; README frontmatter

Browse files
Files changed (6) hide show
  1. .gitattributes +2 -2
  2. Dockerfile +16 -6
  3. README.md +11 -6
  4. generator.py +48 -49
  5. main.py +31 -31
  6. requirements.txt +5 -5
.gitattributes CHANGED
@@ -1,4 +1,4 @@
1
- * text=auto
2
  *.md text eol=lf
3
  *.py text eol=lf
4
- Dockerfile text eol=lf
 
1
+ * text=auto eol=lf
2
  *.md text eol=lf
3
  *.py text eol=lf
4
+ Dockerfile text eol=lf
Dockerfile CHANGED
@@ -1,7 +1,17 @@
1
- FROM python:3.10-slim
 
 
 
 
 
 
 
2
  WORKDIR /app
3
- COPY requirements.txt .
4
- RUN pip install --no-cache-dir -r requirements.txt
5
- COPY . .
6
- ENV PORT=7860
7
- CMD ["python", "main.py"]
 
 
 
 
1
# Dockerfile
FROM python:3.11-slim

# PORT is honored by the CMD below (default 7860, the HF Spaces convention).
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PORT=7860

WORKDIR /app

# Copy requirements first so the pip layer is cached across code-only edits.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY . /app

EXPOSE 7860
# Shell form so ${PORT} is actually expanded: exec-form CMD performs no
# variable substitution, which silently ignored the PORT env var above.
CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
README.md CHANGED
@@ -1,13 +1,18 @@
1
- ---
2
  title: PDF Q&A Generator
3
- emoji:
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
  # PDF Q&A Generator
11
 
12
- FastAPI service that turns a PDF into Q&A pairs.
13
- Endpoints: `/`, `/health`, `/generate` (multipart file `file`, param `num_questions`).
 
 
 
 
 
 
1
+ ---
2
  title: PDF Q&A Generator
3
+ emoji: 🧩
4
+ colorFrom: indigo
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
  # PDF Q&A Generator
11
 
12
+ Convert any PDF into a small Q&A dataset (no external API keys).
13
+
14
+ ## Usage
15
+ - `POST /generate` with a PDF file (`file`) and optional `num_questions` (default 10)
16
+ - `GET /health` for status
17
+
18
+ This Space uses **docker** and serves FastAPI on port **7860**.
generator.py CHANGED
@@ -1,57 +1,56 @@
1
- import os, re
2
  from typing import List, Dict
3
 
4
  class QAGenerator:
5
- def __init__(self):
6
- self.use_groq = False
7
- try:
8
- from langchain_groq import ChatGroq # noqa
9
- key = os.getenv("GROQ_API_KEY")
10
- if key:
11
- self.ChatGroq = ChatGroq
12
- self.llm = ChatGroq(api_key=key, model="llama-3.1-70b-versatile")
13
- self.use_groq = True
14
- except Exception:
15
- self.use_groq = False
 
 
 
 
 
 
 
16
 
17
- def chunk_text(self, text: str, max_chars: int = 1200) -> List[str]:
18
- text = re.sub(r"\s+", " ", text)
19
- chunks = []
20
- for i in range(0, len(text), max_chars):
21
- c = text[i:i+max_chars].strip()
22
- if len(c) > 200:
23
- chunks.append(c)
24
- return chunks
 
 
25
 
26
- def _gen_one_llm(self, chunk: str):
27
- q = self.llm.invoke(
28
- f"Write ONE specific, clear question about this text:\n\n{chunk}\n\nQuestion:"
29
- ).content.strip()
30
- a = self.llm.invoke(
31
- f"Answer the question using ONLY the text.\n\nText:\n{chunk}\n\nQuestion: {q}\n\nAnswer:"
32
- ).content.strip()
33
- return q, a
34
-
35
- def _gen_one_heuristic(self, chunk: str):
36
  sents = re.split(r'(?<=[.!?])\s+', chunk)
37
- first = (sents[0] if sents else chunk)[:180]
38
- q = f"What is the main point of: \"{first}...\"?"
39
- a = first.strip()
40
- return q, a
41
 
42
- def generate(self, text: str, n: int = 10) -> List[Dict]:
43
- pairs = []
44
- for i, c in enumerate(self.chunk_text(text)):
45
- if len(pairs) >= n: break
46
- if self.use_groq:
47
- try:
48
- q, a = self._gen_one_llm(c)
49
- except Exception:
50
- q, a = self._gen_one_heuristic(c)
51
- else:
52
- q, a = self._gen_one_heuristic(c)
53
- pairs.append({
54
- "id": i+1, "question": q.strip(), "answer": a.strip(),
55
- "source_excerpt": (c[:200] + "...") if len(c) > 200 else c
56
  })
57
- return pairs
 
1
# generator.py
from typing import List, Dict


class QAGenerator:
    """Deterministic, template-based Q&A generator (no LLM required).

    Keeps the Space self-contained and fast to boot: no API keys, no
    model downloads, and fully reproducible output for the same input.
    """

    def chunk_text(self, text: str, min_len: int = 200, max_len: int = 1200) -> List[str]:
        """Split *text* into chunks of roughly 220 words each.

        Whitespace is collapsed first; chunks shorter than *min_len*
        characters are dropped and kept chunks are capped at *max_len*
        characters. A short-but-nonempty document still yields one
        (truncated) chunk so generation never comes back empty.
        """
        import re

        collapsed = re.sub(r'\s+', ' ', text).strip()
        if not collapsed:
            return []
        tokens = collapsed.split()
        window = 220  # words per chunk
        kept: List[str] = []
        for start in range(0, len(tokens), window):
            piece = " ".join(tokens[start:start + window])
            if len(piece) >= min_len:
                kept.append(piece[:max_len])
        # Fallback: guarantee at least one chunk for short documents.
        return kept or [collapsed[:max_len]]

    def make_question(self, chunk: str, idx: int) -> str:
        """Return a deterministic question by rotating fixed templates on *idx*."""
        templates = [
            "What is the main idea of this section?",
            "List the key points mentioned here.",
            "Summarize the main purpose discussed.",
            "What steps or procedures are described?",
            "Which entities or tools are referenced?"
        ]
        return templates[idx % len(templates)]

    def make_answer(self, chunk: str) -> str:
        """Heuristic answer: the chunk's first one or two sentences."""
        import re

        sentences = re.split(r'(?<=[.!?])\s+', chunk)
        lead = " ".join(sentences[:2]).strip()
        # Unpunctuated text produces an empty lead; fall back to a prefix.
        return lead if lead else chunk[:300]

    def generate(self, text: str, num_questions: int = 10) -> List[Dict]:
        """Build up to *num_questions* Q&A dicts from *text*.

        Each record carries an id, a templated question, a heuristic
        answer, and a 220-character source excerpt. Returns [] when
        the text has no usable content.
        """
        chunks = self.chunk_text(text)
        if not chunks:
            return []
        dataset: List[Dict] = []
        for i, chunk in enumerate(chunks[:max(1, num_questions)]):
            dataset.append({
                "id": i + 1,
                "question": self.make_question(chunk, i),
                "answer": self.make_answer(chunk),
                "source_excerpt": chunk[:220] + ("..." if len(chunk) > 220 else ""),
            })
        return dataset
main.py CHANGED
@@ -1,32 +1,32 @@
1
- import os, io
2
  from fastapi import FastAPI, UploadFile, File, HTTPException
3
- from fastapi.middleware.cors import CORSMiddleware
 
 
 
 
4
  try:
5
  from pypdf import PdfReader
6
  except Exception:
7
- from PyPDF2 import PdfReader # fallback
8
 
9
  from generator import QAGenerator
10
 
11
  app = FastAPI(
12
  title="PDF Q&A Dataset Generator",
13
- description="Turn any PDF into Q&A pairs (LLM optional)",
14
  version="1.0.0",
15
  )
16
 
17
- app.add_middleware(
18
- CORSMiddleware,
19
- allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
20
- )
21
-
22
  gen = QAGenerator()
23
 
24
  @app.get("/")
25
  def root():
26
  return {
27
- "service": "PDF Q&A Dataset Generator",
28
  "endpoints": ["/generate", "/health"],
29
- "notes": "Set GROQ_API_KEY for LLM; otherwise uses heuristic fallback."
 
30
  }
31
 
32
  @app.get("/health")
@@ -36,29 +36,29 @@ def health():
36
  @app.post("/generate")
37
  async def generate_dataset(file: UploadFile = File(...), num_questions: int = 10):
38
  if not file.filename.lower().endswith(".pdf"):
39
- raise HTTPException(400, "Only PDF files are supported")
40
- data = await file.read()
41
- reader = PdfReader(io.BytesIO(data))
42
- text = []
43
- for p in reader.pages:
44
- try:
 
 
 
45
  t = p.extract_text() or ""
46
- except Exception:
47
- t = ""
48
- if t.strip():
49
- text.append(t)
50
- full = "\n".join(text).strip()
51
- if not full:
52
- raise HTTPException(400, "No extractable text found in PDF")
53
 
54
- qa = gen.generate(full, num_questions)
 
 
 
55
  return {
56
  "filename": file.filename,
57
  "qa_count": len(qa),
58
  "dataset": qa,
59
- "export_formats": ["json", "csv", "jsonl"]
60
- }
61
-
62
- if __name__ == "__main__":
63
- import uvicorn
64
- uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))
 
1
# main.py
# FastAPI entry point for the PDF Q&A Space (served by uvicorn, port 7860).
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse  # NOTE(review): unused in the code visible here — confirm /health doesn't need it before removing
import io
import os  # NOTE(review): also looks unused here; PORT is consumed by the Docker CMD, not this module — verify

# Prefer pypdf, fallback PyPDF2
try:
    from pypdf import PdfReader
except Exception:
    from PyPDF2 import PdfReader  # type: ignore

from generator import QAGenerator

# Metadata below surfaces in the auto-generated OpenAPI docs (/docs).
app = FastAPI(
    title="PDF Q&A Dataset Generator",
    description="Turn a PDF into a small Q&A dataset for demos.",
    version="1.0.0",
)

# Single shared generator instance; QAGenerator in this commit holds no
# per-request state, so module-level reuse is safe.
gen = QAGenerator()

@app.get("/")
def root():
    # Lightweight service descriptor for anyone probing the root URL.
    return {
        "service": "PDF-QA-Generator",
        "endpoints": ["/generate", "/health"],
        "sdk": "docker",
        "port_note": "Container listens on $PORT (default 7860)",
    }
31
 
32
  @app.get("/health")
 
36
@app.post("/generate")
async def generate_dataset(file: UploadFile = File(...), num_questions: int = 10):
    """Turn an uploaded PDF into a small Q&A dataset.

    Args:
        file: multipart upload; filename must end in ``.pdf``.
        num_questions: maximum number of Q&A pairs to emit (default 10).

    Returns:
        dict with ``filename``, ``qa_count``, ``dataset`` and, when text
        was extracted, ``export_formats``.

    Raises:
        HTTPException 400: non-PDF upload or empty file body.
        HTTPException 500: the PDF could not be parsed.
    """
    # file.filename can be None; guard it so a missing name yields the
    # intended 400 instead of an AttributeError-driven 500.
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")
    pdf_bytes = await file.read()
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Empty file.")

    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        text_parts = []
        for p in reader.pages:
            t = p.extract_text() or ""
            if t.strip():
                text_parts.append(t)
        text = "\n".join(text_parts).strip()
    except Exception as e:
        # Chain the original parse error for server-side debugging.
        raise HTTPException(status_code=500, detail=f"PDF parse error: {e}") from e

    if not text:
        # Image-only/scanned PDFs: succeed with an empty dataset.
        return {"filename": file.filename, "qa_count": 0, "dataset": []}

    qa = gen.generate(text, num_questions=num_questions)
    return {
        "filename": file.filename,
        "qa_count": len(qa),
        "dataset": qa,
        "export_formats": ["json", "jsonl", "csv (client-side)"]
    }
 
 
 
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- fastapi
2
- uvicorn
3
- pypdf
4
- PyPDF2
5
- langchain-groq
 
1
+ fastapi>=0.110
2
+ uvicorn[standard]>=0.30
3
+ pypdf2>=3.0
4
+ pypdf>=4.2
5
+ python-multipart>=0.0.9