Spaces:

ZombitX64
/

AutoGDataset

Paused

App Files Files Community

Nattapong Tapachoom committed on Sep 1, 2025

Commit

230725f

1 Parent(s): 75d098a

Enhance README and app.py for PDF to QA dataset generation; add requirements.txt

Browse files

Files changed (4) hide show

README.md +17 -0
__pycache__/app.cpython-313.pyc +0 -0
app.py +284 -4
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -10,3 +10,20 @@ pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+## LangChain + HF Inference
+This app uses LangChain with the Hugging Face Inference API to generate QA datasets from PDFs.
+- Preset models: `HuggingFaceH4/zephyr-7b-beta`, `mistralai/Mistral-7B-Instruct-v0.2`, `google/flan-t5-large`.
+- Provide an `HF_TOKEN` (environment or UI) if your chosen model requires authentication.
+## Usage
+- Run locally: `pip install -r requirements.txt`, then `python app.py`, and open the printed link. Upload one or more PDFs, choose a preset model (or enter a custom model ID), and click Generate Dataset.
+- On Spaces: add a secret `HF_TOKEN` if your chosen model requires it; or paste it in the UI when running.
+### Notes
+- Uses HF Inference API via LangChain; no local `transformers` needed.
+- Output files are saved to `outputs/` as JSON and JSONL.

__pycache__/app.cpython-313.pyc ADDED Viewed

Binary file (15 kB). View file

app.py CHANGED Viewed

@@ -1,7 +1,287 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+import os
+import io
+import re
+import json
+from datetime import datetime
+from typing import List, Dict, Any, Tuple
 import gradio as gr
+try:
+    from pypdf import PdfReader
+except Exception:  # pragma: no cover - lazy import warning only
+    PdfReader = None  # type: ignore
+# LangChain components
+try:
+    from langchain_core.prompts import PromptTemplate
+    from langchain_core.output_parsers import JsonOutputParser
+    from langchain_community.llms import HuggingFaceHub
+except Exception:
+    PromptTemplate = None  # type: ignore
+    JsonOutputParser = None  # type: ignore
+    HuggingFaceHub = None  # type: ignore
def ensure_output_dir() -> str:
    """Return the path of the ``outputs`` folder under the current working directory.

    The folder is created on demand, so the returned path always refers to an
    existing directory when the call succeeds.
    """
    target = os.path.join(os.getcwd(), "outputs")
    if not os.path.isdir(target):
        # exist_ok guards against another caller creating it in between.
        os.makedirs(target, exist_ok=True)
    return target
def read_pdfs(files: List[gr.File]) -> Tuple[str, List[Dict[str, Any]]]:
    """Extract whitespace-normalised text from every page of the given PDFs.

    Args:
        files: Uploaded files; each entry is either an object exposing a
            ``name`` attribute (used as the path) or the path itself.

    Returns:
        A pair ``(combined_text, docs)``. ``combined_text`` joins all
        non-empty page texts with blank lines. ``docs`` has one entry per
        file: ``{"file": <basename>, "pages": [{"page": n, "text": ...}]}``
        where page numbers are 1-based and empty pages are omitted.

    Raises:
        RuntimeError: If ``pypdf`` could not be imported.
    """
    if not files:
        return "", []
    if PdfReader is None:
        raise RuntimeError("pypdf is not installed. Please add it to requirements.txt or pip install pypdf.")

    documents = []
    all_page_texts: List[str] = []
    for uploaded in files:
        pdf_path = getattr(uploaded, "name", uploaded)
        extracted_pages = []
        for page_number, page in enumerate(PdfReader(pdf_path).pages, start=1):
            try:
                raw = page.extract_text() or ""
            except Exception:
                # Best effort: a page that cannot be decoded contributes nothing.
                raw = ""
            cleaned = re.sub(r"\s+", " ", raw).strip()
            if not cleaned:
                continue
            extracted_pages.append({"page": page_number, "text": cleaned})
            all_page_texts.append(cleaned)
        documents.append({"file": os.path.basename(pdf_path), "pages": extracted_pages})

    return "\n\n".join(all_page_texts), documents
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200, max_chunks: int = 5) -> List[Dict[str, Any]]:
    """Split *text* into at most *max_chunks* overlapping character chunks.

    Each chunk is at most *chunk_size* characters long. When a chunk does not
    reach the end of the text, its end is pulled back to the last sentence
    boundary (``.``, ``!`` or ``?`` followed by whitespace) found in the final
    200 characters of the chunk, if there is one.

    Args:
        text: Source text; leading/trailing whitespace is stripped first.
        chunk_size: Maximum chunk length in characters (coerced to int, min 1).
        overlap: Number of characters shared between consecutive chunks
            (coerced to int, min 0).
        max_chunks: Upper bound on the number of chunks returned.

    Returns:
        A list of dicts with keys ``index``, ``start``, ``end`` (offsets into
        the stripped text, ``end`` exclusive) and ``text``.
    """
    text = text.strip()
    if not text:
        return []

    # Coerce to int so float-valued inputs cannot break slicing below.
    chunk_size = max(int(chunk_size), 1)
    overlap = max(int(overlap), 0)
    max_chunks = int(max_chunks)
    lookback = 200  # how far back from the hard cut a sentence end is searched

    chunks: List[Dict[str, Any]] = []
    start = 0
    n = len(text)
    while start < n and len(chunks) < max_chunks:
        end = min(start + chunk_size, n)
        if end < n:
            window_start = max(end - lookback, start)
            boundaries = list(re.finditer(r"[\.!?]\s", text[window_start:end]))
            if boundaries:
                # Match offsets are relative to the window, not to `start`.
                end = window_start + boundaries[-1].end()
        chunks.append({"index": len(chunks), "start": start, "end": end, "text": text[start:end]})
        if end >= n:
            break
        next_start = end - overlap
        if next_start <= start:
            # Overlap would stall or move backwards; drop it to guarantee progress.
            next_start = end
        start = next_start
    return chunks
# Default instruction sent to the model for each chunk. It is rendered as a
# format-style template with the fields {min_pairs}, {max_pairs} and {content};
# the literal braces of the JSON example are doubled so they are emitted as
# text instead of being parsed as a template field.
DEFAULT_QA_PROMPT_TMPL = (
    'You are a helpful dataset creator. Read the provided content and generate between {min_pairs} and {max_pairs} high-quality, factual question-answer pairs. '
    'Return ONLY a JSON array with objects of the form {{"question": str, "answer": str}}. Do not include any extra text, comments, or code fences.\n\n'
    'Content:\n{content}\n'
)
def _normalize_qa_items(data: Any) -> List[Dict[str, Any]]:
    """Keep only dict items that have a non-empty question and answer, as stripped strings."""
    if not isinstance(data, list):
        return []
    pairs: List[Dict[str, Any]] = []
    for item in data:
        if not isinstance(item, dict):
            continue
        raw_q = item.get("question")
        raw_a = item.get("answer")
        # Convert before stripping: values may be numbers or booleans, and a
        # JSON null must count as "missing" rather than become the text "None".
        question = "" if raw_q is None else str(raw_q).strip()
        answer = "" if raw_a is None else str(raw_a).strip()
        if question and answer:
            pairs.append({"question": question, "answer": answer})
    return pairs


def extract_json_array(text: str) -> List[Dict[str, Any]]:
    """Pull a list of question/answer pairs out of free-form model output.

    Markdown code fences are removed, then every ``[`` in the text is tried
    in order as the start of a JSON value. The first decoded list that yields
    at least one valid pair wins, so surrounding prose or trailing bracketed
    text does not prevent extraction.

    Args:
        text: Raw model output; may be empty.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts with both values
        stripped and non-empty, or ``[]`` when nothing usable is found.
    """
    if not text:
        return []
    # Drop opening fences (with optional language tag) and closing fences.
    text = re.sub(r"```[a-zA-Z]*", "", text)

    decoder = json.JSONDecoder()
    pos = text.find("[")
    while pos != -1:
        try:
            data, _ = decoder.raw_decode(text, pos)
        except ValueError:  # includes json.JSONDecodeError
            data = None
        pairs = _normalize_qa_items(data)
        if pairs:
            return pairs
        pos = text.find("[", pos + 1)
    return []
def build_langchain(model_id: str, hf_token: str | None, max_new_tokens: int, temperature: float, custom_instruction: str | None, min_pairs: int, max_pairs: int):
    """Assemble the ``prompt | llm | JSON parser`` runnable used per chunk.

    Args:
        model_id: Hugging Face repo id of the model to query.
        hf_token: API token passed to ``HuggingFaceHub``; may be ``None``.
        max_new_tokens: Generation length limit forwarded in ``model_kwargs``.
        temperature: Sampling temperature; sampling is enabled only when > 0.
        custom_instruction: Optional replacement for the default instruction.
            Its braces are treated as literal text, except for the
            ``{min_pairs}``, ``{max_pairs}`` and ``{content}`` placeholders.
            A ``Content:`` section is always appended.
        min_pairs: Lower bound of pairs requested per chunk.
        max_pairs: Upper bound of pairs requested per chunk.

    Returns:
        A runnable whose ``invoke`` expects ``{"content": <chunk text>}``.

    Raises:
        RuntimeError: If the LangChain imports at file level failed.
    """
    if any(x is None for x in [PromptTemplate, JsonOutputParser, HuggingFaceHub]):
        raise RuntimeError("langchain and langchain-community are required. Please add to requirements.txt.")

    # Prompt
    instruction = (custom_instruction or "").strip()
    if instruction:
        # User text often contains literal braces (e.g. a JSON example), which
        # the template parser would read as variables. Escape them all, then
        # restore the placeholders this app actually supports.
        escaped = instruction.replace("{", "{{").replace("}", "}}")
        for placeholder in ("min_pairs", "max_pairs", "content"):
            escaped = escaped.replace("{{" + placeholder + "}}", "{" + placeholder + "}")
        template = escaped + "\n\nContent:\n{content}\n"
    else:
        template = DEFAULT_QA_PROMPT_TMPL
    prompt = PromptTemplate.from_template(template)

    # Pre-fill the pair bounds on the prompt itself. Binding kwargs on the
    # chain does not feed prompt variables, so they would be reported missing.
    bounds = {"min_pairs": str(int(min_pairs)), "max_pairs": str(int(max_pairs))}
    partials = {name: value for name, value in bounds.items() if name in prompt.input_variables}
    if partials:
        prompt = prompt.partial(**partials)

    # Model wrapper (Hugging Face Inference API)
    llm = HuggingFaceHub(
        repo_id=model_id,
        huggingfacehub_api_token=hf_token,
        model_kwargs={
            "max_new_tokens": int(max_new_tokens),
            "temperature": float(temperature),
            "do_sample": temperature > 0.0,
        },
    )
    return prompt | llm | JsonOutputParser()
def generate_dataset(
    files: List[gr.File],
    preset_model: str,
    custom_model_id: str,
    hf_token: str,
    chunk_size: int,
    overlap: int,
    max_chunks: int,
    max_new_tokens: int,
    temperature: float,
    custom_instruction: str,
    min_pairs: int,
    max_pairs: int,
):
    """Generate a QA dataset from uploaded PDFs and write it to ``outputs/``.

    The PDFs are read and chunked, each chunk is sent through the LangChain
    chain, the returned pairs are normalised and de-duplicated by
    case-insensitive question text, and the result is saved as JSON and JSONL.

    Args:
        files: Uploaded PDF files.
        preset_model: Model id used when *custom_model_id* is blank.
        custom_model_id: Optional model id overriding the preset.
        hf_token: Hugging Face token; blank is treated as "no token".
        chunk_size, overlap, max_chunks: Chunking parameters.
        max_new_tokens, temperature: Generation parameters.
        custom_instruction: Optional instruction overriding the default prompt.
        min_pairs, max_pairs: Requested pairs per chunk (swapped if reversed).

    Returns:
        ``(status_message, json_path, jsonl_path)``; both paths are ``None``
        when nothing was generated.
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timezone

    # Read and chunk
    try:
        full_text, _docs = read_pdfs(files)
    except Exception as e:
        # Surface unreadable/corrupt uploads as a status message, like other failures.
        return f"Error reading PDF(s): {e}", None, None
    chunks = chunk_text(full_text, chunk_size=int(chunk_size), overlap=int(overlap), max_chunks=int(max_chunks))
    if not chunks:
        return "No text extracted from PDF(s).", None, None

    min_pairs, max_pairs = sorted((int(min_pairs), int(max_pairs)))
    model_id = (custom_model_id or "").strip() or preset_model
    token = (hf_token or "").strip() or None
    try:
        chain = build_langchain(model_id, token, max_new_tokens, temperature, custom_instruction, min_pairs, max_pairs)
    except Exception as e:
        return f"Error preparing LangChain: {e}", None, None

    results: List[Dict[str, Any]] = []
    last_error = None
    for ch in chunks:
        try:
            data = chain.invoke({"content": ch["text"]})
            items = data if isinstance(data, list) else extract_json_array(str(data))
        except Exception as e:
            last_error = e
            # If the parsing chain fails, retry once without the parser and
            # do a best-effort extraction on the raw model output.
            try:
                raw_chain = PromptTemplate.from_template(DEFAULT_QA_PROMPT_TMPL) | HuggingFaceHub(repo_id=model_id, huggingfacehub_api_token=token)  # type: ignore
                raw = raw_chain.invoke({"content": ch["text"], "min_pairs": min_pairs, "max_pairs": max_pairs})
                items = extract_json_array(str(raw))
            except Exception as fallback_error:
                last_error = fallback_error
                items = []

        context = ch["text"][:500] + ("..." if len(ch["text"]) > 500 else "")
        for it in items:
            if not isinstance(it, dict):
                continue
            # Model output is untyped: coerce to str before stripping.
            question = str(it.get("question") or "").strip()
            answer = str(it.get("answer") or "").strip()
            if question and answer:
                results.append({**it, "question": question, "answer": answer, "context": context})

    if not results:
        message = "Model did not return any valid QA pairs. Try adjusting prompt or model."
        if last_error is not None:
            # Without this, auth/network failures are indistinguishable from bad output.
            message += f" Last error: {last_error}"
        return message, None, None

    # Deduplicate by question (case-insensitive), keeping the first occurrence
    seen = set()
    unique = []
    for r in results:
        key = r["question"].lower()
        if key not in seen:
            seen.add(key)
            unique.append(r)

    # Save to outputs
    outdir = ensure_output_dir()
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    json_path = os.path.join(outdir, f"dataset_qa_{ts}.json")
    jsonl_path = os.path.join(outdir, f"dataset_qa_{ts}.jsonl")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(unique, f, ensure_ascii=False, indent=2)
    with open(jsonl_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in unique)
    return f"Generated {len(unique)} QA pairs.", json_path, jsonl_path
# Model repo ids offered in the "Preset Model" dropdown; the first is the default.
PRESET_MODELS = [
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.2",
    "google/flan-t5-large",
]


def _generate_with_server_token(files, preset_model_id, custom_id, token, *settings):
    """Call ``generate_dataset``, using the server's ``HF_TOKEN`` when the UI field is blank.

    The environment token is resolved here, on the server, instead of being
    pre-filled into the textbox: a component's initial value is sent to every
    browser that loads the page, which would expose the secret.
    """
    resolved = (token or "").strip() or os.environ.get("HF_TOKEN", "")
    return generate_dataset(files, preset_model_id, custom_id, resolved, *settings)


with gr.Blocks(title="AutoGDataset - PDF to QA Dataset (LangChain)") as demo:
    gr.Markdown("""
    # AutoGDataset
    Generate QA datasets from PDFs using LangChain with Hugging Face models (Inference API).
    Choose one of the preset models or provide a custom repo id. Provide a valid `HF_TOKEN` if required by the model.
    """)
    with gr.Row():
        pdf_files = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
    with gr.Group():
        with gr.Row():
            preset_model = gr.Dropdown(label="Preset Model", choices=PRESET_MODELS, value=PRESET_MODELS[0])
            custom_model_id = gr.Textbox(label="Custom Model ID (optional)", placeholder="org/model-name")
        with gr.Row():
            # No initial value on purpose: see _generate_with_server_token.
            hf_token = gr.Textbox(label="HF Token", type="password", placeholder="hf_xxx (required for many models)")
        with gr.Row():
            max_new_tokens = gr.Slider(64, 1024, value=512, step=16, label="Max New Tokens")
            temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature")
    with gr.Accordion("Advanced", open=False):
        with gr.Row():
            chunk_size = gr.Slider(500, 4000, value=1500, step=50, label="Chunk Size (chars)")
            overlap = gr.Slider(0, 1000, value=200, step=50, label="Overlap (chars)")
            max_chunks = gr.Slider(1, 40, value=5, step=1, label="Max Chunks")
        with gr.Row():
            min_pairs = gr.Slider(1, 10, value=3, step=1, label="Min Pairs/Chunk")
            max_pairs = gr.Slider(1, 12, value=6, step=1, label="Max Pairs/Chunk")
        custom_instruction = gr.Textbox(label="Custom Instruction (optional)", lines=3, placeholder="Override default instruction. Must ask for a pure JSON array of {question, answer}.")
    generate_btn = gr.Button("Generate Dataset", variant="primary")
    with gr.Row():
        status = gr.Markdown()
    with gr.Row():
        out_json = gr.File(label="Download JSON")
        out_jsonl = gr.File(label="Download JSONL")
    # Input order must match the positional parameters of generate_dataset.
    generate_btn.click(
        fn=_generate_with_server_token,
        inputs=[
            pdf_files,
            preset_model,
            custom_model_id,
            hf_token,
            chunk_size,
            overlap,
            max_chunks,
            max_new_tokens,
            temperature,
            custom_instruction,
            min_pairs,
            max_pairs,
        ],
        outputs=[status, out_json, out_jsonl],
        show_progress=True,
        api_name="generate",
    )
if __name__ == "__main__":
    # For local runs
    demo.queue().launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio==5.44.1
+pypdf>=4.2.0
+huggingface_hub>=0.23.0
+langchain>=0.2.0
+langchain-community>=0.2.0