Nattapong Tapachoom committed on
Commit
230725f
·
1 Parent(s): 75d098a

Enhance README and app.py for PDF to QA dataset generation; add requirements.txt

Browse files
Files changed (4) hide show
  1. README.md +17 -0
  2. __pycache__/app.cpython-313.pyc +0 -0
  3. app.py +284 -4
  4. requirements.txt +5 -0
README.md CHANGED
@@ -10,3 +10,20 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+
14
+ ## LangChain + HF Inference
15
+
16
+ This app uses LangChain with the Hugging Face Inference API to generate QA datasets from PDFs.
17
+
18
+ - Preset models: `HuggingFaceH4/zephyr-7b-beta`, `mistralai/Mistral-7B-Instruct-v0.2`, `google/flan-t5-large`.
19
+ - Provide an `HF_TOKEN` (environment or UI) if your chosen model requires authentication.
20
+
21
+ ## Usage
22
+
23
+ - Run locally: `pip install -r requirements.txt` then `python app.py` and open the link. Upload one or more PDFs, choose a preset model (or enter a custom model ID), and click Generate Dataset.
24
+ - On Spaces: add a secret `HF_TOKEN` if your chosen model requires it; or paste it in the UI when running.
25
+
26
+ ### Notes
27
+
28
+ - Uses HF Inference API via LangChain; no local `transformers` needed.
29
+ - Output files are saved to `outputs/` as JSON and JSONL.
__pycache__/app.cpython-313.pyc ADDED
Binary file (15 kB). View file
 
app.py CHANGED
@@ -1,7 +1,287 @@
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
1
+ import os
2
+ import io
3
+ import re
4
+ import json
5
+ from datetime import datetime
6
+ from typing import List, Dict, Any, Tuple
7
+
8
  import gradio as gr
9
 
10
+ try:
11
+ from pypdf import PdfReader
12
+ except Exception: # pragma: no cover - lazy import warning only
13
+ PdfReader = None # type: ignore
14
+
15
+ # LangChain components
16
+ try:
17
+ from langchain_core.prompts import PromptTemplate
18
+ from langchain_core.output_parsers import JsonOutputParser
19
+ from langchain_community.llms import HuggingFaceHub
20
+ except Exception:
21
+ PromptTemplate = None # type: ignore
22
+ JsonOutputParser = None # type: ignore
23
+ HuggingFaceHub = None # type: ignore
24
+
25
+
26
def ensure_output_dir() -> str:
    """Create ``outputs`` under the current working directory if needed.

    Returns:
        The absolute path of the ``outputs`` directory.
    """
    output_path = os.path.join(os.getcwd(), "outputs")
    # exist_ok makes repeated calls harmless once the directory exists.
    os.makedirs(output_path, exist_ok=True)
    return output_path
30
+
31
+
32
def read_pdfs(files: List[gr.File]) -> Tuple[str, List[Dict[str, Any]]]:
    """Extract whitespace-normalised text from every page of the given PDFs.

    Args:
        files: Uploaded files; each entry is either an object exposing a
            ``name`` attribute holding the path, or the path itself.

    Returns:
        A tuple ``(combined_text, docs)``. ``combined_text`` joins all
        non-empty page texts with blank lines. ``docs`` holds one
        ``{"file": basename, "pages": [{"page": n, "text": ...}]}`` entry per
        input file, with 1-based page numbers.

    Raises:
        RuntimeError: If ``pypdf`` could not be imported.
    """
    if not files:
        return "", []
    if PdfReader is None:
        raise RuntimeError("pypdf is not installed. Please add it to requirements.txt or pip install pypdf.")

    documents = []
    all_page_texts: List[str] = []
    for upload in files:
        pdf_path = getattr(upload, "name", upload)
        pdf = PdfReader(pdf_path)
        page_records = []
        for page_number, page in enumerate(pdf.pages, start=1):
            # Best effort: a page that cannot be parsed contributes no text.
            try:
                extracted = page.extract_text() or ""
            except Exception:
                extracted = ""
            # Collapse every whitespace run into a single space.
            cleaned = re.sub(r"\s+", " ", extracted).strip()
            if not cleaned:
                continue
            page_records.append({"page": page_number, "text": cleaned})
            all_page_texts.append(cleaned)
        documents.append({"file": os.path.basename(pdf_path), "pages": page_records})

    return "\n\n".join(all_page_texts), documents
57
+
58
+
59
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200, max_chunks: int = 5) -> List[Dict[str, Any]]:
    """Split ``text`` into overlapping character chunks, preferring sentence ends.

    Each chunk is at most ``chunk_size`` characters. When a chunk does not
    reach the end of the text, its end is pulled back to just after the last
    sentence terminator (``.``, ``!`` or ``?`` followed by whitespace) found
    in the final 200 characters of the chunk, if there is one.

    Args:
        text: Source text; surrounding whitespace is stripped first.
        chunk_size: Maximum chunk length in characters (values below 1 are
            treated as 1).
        overlap: Number of characters shared between consecutive chunks
            (negative values are treated as 0).
        max_chunks: Upper bound on the number of chunks returned.

    Returns:
        A list of ``{"index", "start", "end", "text"}`` dicts where
        ``text == stripped_text[start:end]``. Empty input yields ``[]``.
    """
    text = text.strip()
    if not text:
        return []

    # UI sliders may hand over floats; slicing needs ints.
    chunk_size = max(int(chunk_size), 1)
    overlap = max(int(overlap), 0)
    lookback = 200  # how far back from the hard cut to look for a sentence end

    chunks: List[Dict[str, Any]] = []
    start = 0
    n = len(text)
    while start < n and len(chunks) < max_chunks:
        end = min(start + chunk_size, n)
        if end < n:
            # Match offsets are relative to the window, so they must be added
            # to window_start (not to start) to get an absolute position.
            window_start = max(end - lookback, start)
            boundaries = list(re.finditer(r"[.!?]\s", text[window_start:end]))
            if boundaries:
                # The last boundary keeps the chunk as long as possible.
                end = window_start + boundaries[-1].end()
        chunks.append({"index": len(chunks), "start": start, "end": end, "text": text[start:end]})
        if end >= n:
            break
        # Step back by the overlap, but always move forward; otherwise an
        # overlap larger than the chunk would emit the same chunk repeatedly.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks
82
+
83
+
84
# Default instruction rendered once per chunk. The template is filled with
# format-style placeholders ({min_pairs}, {max_pairs}, {content}), so the
# literal braces of the JSON example are doubled; left single they would be
# parsed as a placeholder and rendering would fail.
DEFAULT_QA_PROMPT_TMPL = (
    'You are a helpful dataset creator. Read the provided content and generate between {min_pairs} and {max_pairs} high-quality, factual question-answer pairs. '
    'Return ONLY a JSON array with objects of the form {{"question": str, "answer": str}}. Do not include any extra text, comments, or code fences.\n\n'
    'Content:\n{content}\n'
)
89
+
90
+
91
def extract_json_array(text: str) -> List[Dict[str, Any]]:
    """Pull a list of question/answer dicts out of raw model output.

    Markdown code fences are removed, then the span from the first ``[`` to
    the last ``]`` (or the whole text when no such span exists) is parsed as
    JSON.

    Args:
        text: Raw model output; may be empty.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts with both values
        stripped. Entries that are not dicts, or whose question or answer is
        missing, ``null`` or blank, are skipped. Returns ``[]`` when the text
        is empty, is not valid JSON, or does not decode to a list.
    """
    if not text:
        return []

    # Drop code fences, with or without a language tag (```json, ```).
    text = re.sub(r"```[a-zA-Z]*", "", text)

    start = text.find("[")
    end = text.rfind("]")
    candidate = text[start : end + 1] if start != -1 and end > start else text

    try:
        data = json.loads(candidate)
    except (ValueError, RecursionError):
        return []
    if not isinstance(data, list):
        return []

    pairs: List[Dict[str, Any]] = []
    for item in data:
        if not isinstance(item, dict):
            continue
        question = item.get("question")
        answer = item.get("answer")
        # Convert before stripping: models sometimes emit numbers or booleans,
        # and one such value must not discard the whole array.
        q = "" if question is None else str(question).strip()
        a = "" if answer is None else str(answer).strip()
        if q and a:
            pairs.append({"question": q, "answer": a})
    return pairs
120
+
121
+
122
def build_langchain(model_id: str, hf_token: str | None, max_new_tokens: int, temperature: float, custom_instruction: str | None, min_pairs: int, max_pairs: int):
    """Build the ``prompt | llm | JSON parser`` chain used for one chunk.

    Args:
        model_id: Hugging Face repo id of the model to call.
        hf_token: API token, or ``None`` to send none explicitly.
        max_new_tokens: Generation length limit forwarded to the model.
        temperature: Sampling temperature; values ``<= 0`` disable sampling.
        custom_instruction: Optional replacement for the default instruction.
            A ``Content:`` section with the ``{content}`` placeholder is
            appended to it.
        min_pairs: Value pre-filled for the ``{min_pairs}`` placeholder.
        max_pairs: Value pre-filled for the ``{max_pairs}`` placeholder.

    Returns:
        A runnable chain that only needs ``{"content": ...}`` at invoke time.

    Raises:
        RuntimeError: If the LangChain packages could not be imported.
    """
    if any(x is None for x in [PromptTemplate, JsonOutputParser, HuggingFaceHub]):
        raise RuntimeError("langchain and langchain-community are required. Please add to requirements.txt.")

    instruction = (custom_instruction or "").strip()
    template = instruction + "\n\nContent:\n{content}\n" if instruction else DEFAULT_QA_PROMPT_TMPL
    prompt = PromptTemplate.from_template(template)

    # Pre-fill the pair limits on the prompt itself. Runnable.bind() only
    # forwards keyword arguments to invoke(); it does not supply template
    # variables, so the prompt would still report them as missing. A custom
    # instruction may not use these placeholders, so fill only declared ones.
    defaults = {"min_pairs": str(int(min_pairs)), "max_pairs": str(int(max_pairs))}
    declared = {name: value for name, value in defaults.items() if name in prompt.input_variables}
    if declared:
        prompt = prompt.partial(**declared)

    sampling = temperature > 0.0
    model_kwargs: Dict[str, Any] = {
        "max_new_tokens": int(max_new_tokens),
        "do_sample": sampling,
    }
    if sampling:
        # Only sent when sampling; some inference backends are understood to
        # reject a temperature of 0 (TODO confirm against the target backend).
        model_kwargs["temperature"] = temperature

    # Model wrapper (Hugging Face Inference API)
    llm = HuggingFaceHub(
        repo_id=model_id,
        huggingfacehub_api_token=hf_token,
        model_kwargs=model_kwargs,
    )
    return prompt | llm | JsonOutputParser()
143
+
144
+
145
def generate_dataset(
    files: List[gr.File],
    preset_model: str,
    custom_model_id: str,
    hf_token: str,
    chunk_size: int,
    overlap: int,
    max_chunks: int,
    max_new_tokens: int,
    temperature: float,
    custom_instruction: str,
    min_pairs: int,
    max_pairs: int,
):
    """Generate a deduplicated QA dataset from PDFs and save it as JSON and JSONL.

    The PDFs are read and chunked, each chunk is sent through the chain from
    ``build_langchain``, and the resulting question/answer pairs are tagged
    with a short context excerpt, deduplicated by question
    (case-insensitive), and written to the ``outputs`` directory.

    Args:
        files: Uploaded PDF files.
        preset_model: Model id used when ``custom_model_id`` is blank.
        custom_model_id: Optional model id overriding the preset.
        hf_token: Hugging Face token; blank means none is sent.
        chunk_size: Maximum chunk length in characters.
        overlap: Characters shared between consecutive chunks.
        max_chunks: Maximum number of chunks to process.
        max_new_tokens: Generation length limit.
        temperature: Sampling temperature.
        custom_instruction: Optional replacement for the default instruction.
        min_pairs: Lower bound of pairs requested per chunk.
        max_pairs: Upper bound of pairs requested per chunk.

    Returns:
        ``(status_message, json_path, jsonl_path)``; both paths are ``None``
        when nothing was generated.
    """
    # Imported here so this function does not depend on the module header.
    from datetime import timezone

    # Numeric inputs may arrive as floats; slicing and counts need ints.
    chunk_size, overlap, max_chunks = int(chunk_size), int(overlap), int(max_chunks)
    max_new_tokens = int(max_new_tokens)
    # Tolerate swapped bounds instead of asking for "between 6 and 3" pairs.
    min_pairs, max_pairs = sorted((int(min_pairs), int(max_pairs)))
    token = (hf_token or "").strip() or None

    # Read and chunk
    try:
        full_text, _docs = read_pdfs(files)
    except Exception as e:
        return f"Error reading PDF(s): {e}", None, None
    chunks = chunk_text(full_text, chunk_size=chunk_size, overlap=overlap, max_chunks=max_chunks)
    if not chunks:
        return "No text extracted from PDF(s).", None, None

    model_id = (custom_model_id or "").strip() or preset_model
    try:
        chain = build_langchain(model_id, token, max_new_tokens, temperature, custom_instruction, min_pairs, max_pairs)
    except Exception as e:
        return f"Error preparing LangChain: {e}", None, None

    results: List[Dict[str, Any]] = []
    last_error = None  # surfaced in the status when nothing usable comes back
    for ch in chunks:
        try:
            data = chain.invoke({"content": ch["text"]})
            if isinstance(data, list):
                items = data
            elif isinstance(data, dict):
                # A single object instead of an array: treat it as one pair.
                items = [data]
            else:
                items = extract_json_array(str(data))
        except Exception as exc:
            last_error = exc
            # Parser errors may carry the raw completion; reusing it avoids a
            # second model call. (Assumes the exception exposes `llm_output`;
            # if not, fall through to the retry below.)
            raw = getattr(exc, "llm_output", None)
            if not raw:
                try:
                    raw_chain = PromptTemplate.from_template(DEFAULT_QA_PROMPT_TMPL) | HuggingFaceHub(repo_id=model_id, huggingfacehub_api_token=token)  # type: ignore
                    raw = raw_chain.invoke({"content": ch["text"], "min_pairs": min_pairs, "max_pairs": max_pairs})
                except Exception as retry_exc:
                    last_error = retry_exc
                    raw = ""
            items = extract_json_array(str(raw))

        context = ch["text"][:500] + ("..." if len(ch["text"]) > 500 else "")
        for it in items:
            if not isinstance(it, dict):
                continue
            # Parsed JSON may hold non-string values; normalise before use.
            question = str(it.get("question") or "").strip()
            answer = str(it.get("answer") or "").strip()
            if question and answer:
                results.append({**it, "question": question, "answer": answer, "context": context})

    if not results:
        message = "Model did not return any valid QA pairs. Try adjusting prompt or model."
        if last_error is not None:
            message += f" Last error: {last_error}"
        return message, None, None

    # Deduplicate by question (case-insensitive), keeping the first occurrence
    seen = set()
    unique = []
    for r in results:
        key = r["question"].lower()
        if key not in seen:
            seen.add(key)
            unique.append(r)

    # Save to outputs
    outdir = ensure_output_dir()
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    json_path = os.path.join(outdir, f"dataset_qa_{ts}.json")
    jsonl_path = os.path.join(outdir, f"dataset_qa_{ts}.jsonl")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(unique, f, ensure_ascii=False, indent=2)
    with open(jsonl_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in unique)

    return f"Generated {len(unique)} QA pairs.", json_path, jsonl_path
217
+
218
+
219
# Model repo ids offered in the "Preset Model" dropdown; the first entry is
# used as the dropdown's initial value.
PRESET_MODELS = [
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.2",
    "google/flan-t5-large",
]
224
+
225
+
226
+ with gr.Blocks(title="AutoGDataset - PDF to QA Dataset (LangChain)") as demo:
227
+ gr.Markdown("""
228
+ # AutoGDataset
229
+ Generate QA datasets from PDFs using LangChain with Hugging Face models (Inference API).
230
+ Choose one of the preset models or provide a custom repo id. Provide a valid `HF_TOKEN` if required by the model.
231
+ """)
232
+
233
+ with gr.Row():
234
+ pdf_files = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
235
+
236
+ with gr.Group():
237
+ with gr.Row():
238
+ preset_model = gr.Dropdown(label="Preset Model", choices=PRESET_MODELS, value=PRESET_MODELS[0])
239
+ custom_model_id = gr.Textbox(label="Custom Model ID (optional)", placeholder="org/model-name")
240
+ with gr.Row():
241
+ hf_token = gr.Textbox(label="HF Token", type="password", value=os.environ.get("HF_TOKEN", ""), placeholder="hf_xxx (required for many models)")
242
+ with gr.Row():
243
+ max_new_tokens = gr.Slider(64, 1024, value=512, step=16, label="Max New Tokens")
244
+ temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature")
245
+
246
+ with gr.Accordion("Advanced", open=False):
247
+ with gr.Row():
248
+ chunk_size = gr.Slider(500, 4000, value=1500, step=50, label="Chunk Size (chars)")
249
+ overlap = gr.Slider(0, 1000, value=200, step=50, label="Overlap (chars)")
250
+ max_chunks = gr.Slider(1, 40, value=5, step=1, label="Max Chunks")
251
+ with gr.Row():
252
+ min_pairs = gr.Slider(1, 10, value=3, step=1, label="Min Pairs/Chunk")
253
+ max_pairs = gr.Slider(1, 12, value=6, step=1, label="Max Pairs/Chunk")
254
+ custom_instruction = gr.Textbox(label="Custom Instruction (optional)", lines=3, placeholder="Override default instruction. Must ask for a pure JSON array of {question, answer}.")
255
+
256
+ generate_btn = gr.Button("Generate Dataset", variant="primary")
257
+
258
+ with gr.Row():
259
+ status = gr.Markdown()
260
+ with gr.Row():
261
+ out_json = gr.File(label="Download JSON")
262
+ out_jsonl = gr.File(label="Download JSONL")
263
+
264
+ generate_btn.click(
265
+ fn=generate_dataset,
266
+ inputs=[
267
+ pdf_files,
268
+ preset_model,
269
+ custom_model_id,
270
+ hf_token,
271
+ chunk_size,
272
+ overlap,
273
+ max_chunks,
274
+ max_new_tokens,
275
+ temperature,
276
+ custom_instruction,
277
+ min_pairs,
278
+ max_pairs,
279
+ ],
280
+ outputs=[status, out_json, out_jsonl],
281
+ show_progress=True,
282
+ api_name="generate",
283
+ )
284
 
285
+ if __name__ == "__main__":
286
+ # For local runs
287
+ demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==5.44.1
2
+ pypdf>=4.2.0
3
+ huggingface_hub>=0.23.0
4
+ langchain>=0.2.0
5
+ langchain-community>=0.2.0