Upload folder using huggingface_hub

#2
Files changed (2) hide show
  1. app.py +430 -92
  2. requirements.txt +11 -4
app.py CHANGED
@@ -1,92 +1,430 @@
1
- import gradio as gr
2
- import torch
3
- import os
4
- from transformers import pipeline
5
- from huggingface_hub import login
6
-
7
- login(token=os.getenv("hf_tok"))
8
-
9
- MODEL_NAME = "google/gemma-2b-it"
10
- generator = pipeline(
11
- "text-generation",
12
- model=MODEL_NAME,
13
- torch_dtype=torch.float16,
14
- device_map="auto",
15
- )
16
-
17
- def build_prompt(syllabus, level, output_type):
18
- instruction = ""
19
- if output_type == "Lesson Material":
20
- instruction = f"""Create detailed lesson material based on the syllabus below.
21
- Level: {level}
22
-
23
- Syllabus:
24
- {syllabus}
25
-
26
- Include:
27
- - Concept explanations
28
- - Key ideas
29
- - Examples
30
- - Classroom discussion points"""
31
- elif output_type == "Questions and Answers":
32
- instruction = f"""Generate 10 descriptive university-level questions with answers.
33
- Level: {level}
34
-
35
- Syllabus:
36
- {syllabus}
37
-
38
- Format clearly as:
39
- Question
40
- Answer"""
41
- else:
42
- instruction = f"""Generate 10 multiple choice questions.
43
- Level: {level}
44
-
45
- Syllabus:
46
- {syllabus}
47
-
48
- Format:
49
- Question
50
- A)
51
- B)
52
- C)
53
- D)
54
- Correct Answer:"""
55
-
56
- messages = [
57
- {"role": "system", "content": "You are an expert university teacher."},
58
- {"role": "user", "content": instruction}
59
- ]
60
- return generator.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
61
-
62
- def generate_content(syllabus, level, output_type):
63
- prompt = build_prompt(syllabus, level, output_type)
64
-
65
- output = generator(
66
- prompt,
67
- max_new_tokens=800,
68
- temperature=0.7,
69
- top_p=0.9,
70
- do_sample=True,
71
- pad_token_id=generator.tokenizer.eos_token_id
72
- )
73
-
74
- text = output[0]["generated_text"]
75
- return text.replace(prompt, "").strip()
76
-
77
- with gr.Blocks() as demo:
78
- gr.Markdown("# AI Syllabus Teaching Assistant")
79
-
80
- syllabus = gr.Textbox(label="Paste your syllabus", lines=10)
81
- level = gr.Dropdown(["School", "Undergraduate", "Postgraduate"], value="Undergraduate", label="Level")
82
- output_type = gr.Radio(["Lesson Material", "Questions and Answers", "MCQs"], value="Lesson Material", label="Generate")
83
- generate_btn = gr.Button("Generate")
84
- output = gr.Textbox(label="AI Output", lines=20)
85
-
86
- generate_btn.click(
87
- generate_content,
88
- inputs=[syllabus, level, output_type],
89
- outputs=output
90
- )
91
-
92
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import threading
4
+ import pathlib
5
+ import base64
6
+ import urllib.parse
7
+
8
+ # ──────────────────────────────────────────────
9
+ # FILE TEXT EXTRACTION
10
+ # ──────────────────────────────────────────────
11
+
12
# File extensions the extractor understands (documents + OCR-able images).
SUPPORTED_EXT = (".pdf", ".docx", ".doc", ".txt",
                 ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff")


def extract_text_from_file(filepath: str) -> str:
    """Extract plain text from PDF, DOCX, TXT, or image files.

    Args:
        filepath: Path to the uploaded file (may be empty/None).

    Returns:
        The extracted text, "" for a falsy path, or a "⚠️ ..." message
        string on failure — this function never raises, so the caller can
        merge the result straight into the prompt/preview.
    """
    if not filepath:
        return ""
    ext = pathlib.Path(filepath).suffix.lower()
    try:
        # ── PDF ──
        if ext == ".pdf":
            import fitz  # pymupdf
            # Use a context manager so the document handle is always closed
            # (the original opened it and never closed it — a leak when many
            # files are previewed in one session).
            with fitz.open(filepath) as doc:
                return "\n".join(page.get_text() for page in doc).strip()

        # ── Word (.docx / .doc) ──
        elif ext in (".docx", ".doc"):
            # NOTE(review): python-docx parses only OOXML (.docx); a legacy
            # binary .doc raises here and falls through to the error message.
            from docx import Document
            doc = Document(filepath)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip()).strip()

        # ── Plain text ──
        elif ext == ".txt":
            with open(filepath, "r", encoding="utf-8", errors="replace") as f:
                return f.read().strip()

        # ── Images (OCR via pytesseract) ──
        elif ext in (".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"):
            try:
                import pytesseract
                from PIL import Image
                img = Image.open(filepath)
                return pytesseract.image_to_string(img).strip()
            except Exception as ocr_err:
                # OCR needs the native Tesseract binary, not just the pip
                # package — point the user at the installer.
                return (
                    f"⚠️ OCR failed: {ocr_err}\n"
                    "Ensure Tesseract-OCR is installed: https://github.com/UB-Mannheim/tesseract/wiki"
                )

        else:
            return f"⚠️ Unsupported file type: {ext}"

    except Exception as e:
        return f"⚠️ Could not extract text: {e}"
56
+
57
+
58
+ # ──────────────────────────────────────────────
59
+ # MODEL CONFIGURATIONS (all run via transformers pipeline)
60
+ # ──────────────────────────────────────────────
61
+ #
62
+ # Grouped by RAM tier so users can pick what fits their machine.
63
+ # Models are downloaded from HF Hub on first use and cached locally.
64
+
65
# Model catalogue shown in the dropdown.  Keys are the human-readable labels
# (with approximate RAM needs), values are the Hugging Face Hub repo IDs.
# Grouped into three RAM tiers so users can pick what fits their machine;
# weights are downloaded from the Hub on first use and cached locally.
MODELS = {
    # Tier 1: Fast (<2 GB RAM)
    "⚑ Qwen2.5-0.5B [~1 GB | Fastest]": "Qwen/Qwen2.5-0.5B-Instruct",
    "πŸ’« Qwen2.5-1.5B [~2 GB | Fast]": "Qwen/Qwen2.5-1.5B-Instruct",
    # Tier 2: Balanced (2-4 GB RAM)
    "πŸ”¬ DeepSeek-R1-Distill 1.5B [~2 GB]": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "🦩 Llama-3.2-1B-Instruct [~2 GB]": "meta-llama/Llama-3.2-1B-Instruct",
    "🦩 Llama-3.2-3B-Instruct [~4 GB]": "meta-llama/Llama-3.2-3B-Instruct",
    "πŸ’Ά Phi-3-mini-4k [~4 GB | Strong]": "microsoft/Phi-3-mini-4k-instruct",
    # Tier 3: Quality (4-8 GB RAM)
    "πŸ’Ž Gemma-2-2B-it [~3 GB | Google]": "google/gemma-2-2b-it",
    "πŸ”₯ Qwen2.5-3B [~4 GB | Balanced]": "Qwen/Qwen2.5-3B-Instruct",
    "πŸ₯‡ Llama-3.1-8B-Instruct [~8 GB]": "meta-llama/Llama-3.1-8B-Instruct",
    "πŸ₯‡ Qwen2.5-7B [~8 GB | Best quality]": "Qwen/Qwen2.5-7B-Instruct",
}

# Dropdown choices, in catalogue order.
ALL_MODEL_NAMES = list(MODELS)
82
+
83
# ──────────────────────────────────────────────
# PIPELINE CACHE (lazy-loaded, thread-safe)
# ──────────────────────────────────────────────
_pipeline_cache: dict = {}
_pipeline_lock = threading.Lock()


def get_pipeline(model_id: str, hf_token: str = ""):
    """Download (on first use) and cache a transformers text-generation pipeline.

    Args:
        model_id: Hugging Face Hub repo ID of the model.
        hf_token: Optional HF access token for gated models.

    Returns:
        A ``(pipeline, None)`` tuple on success, or ``(None, error_message)``
        on failure.  Failures are NOT cached, so transient errors (e.g. a
        network blip during download) can be retried on the next call.
    """
    # The lock is held for the whole load so concurrent requests for the same
    # model don't download it twice; other requests simply wait.
    with _pipeline_lock:
        if model_id not in _pipeline_cache:
            try:
                # Imported lazily (and inside the try) so a missing
                # transformers install surfaces as an error message, not a crash.
                from transformers import pipeline, AutoTokenizer
                token = hf_token.strip() if hf_token else None
                tok = AutoTokenizer.from_pretrained(model_id, token=token)
                pipe = pipeline(
                    "text-generation",
                    model=model_id,
                    tokenizer=tok,
                    device_map="cpu",
                    # `torch_dtype` is the kwarg accepted across the whole
                    # transformers>=4.44 range pinned in requirements.txt; the
                    # bare `dtype` kwarg used before is only understood by much
                    # newer releases and is misrouted/ignored by older ones.
                    torch_dtype="auto",
                    trust_remote_code=True,
                    token=token,
                )
                # Some checkpoints ship generation_config.max_length=20, which
                # would truncate output; clear it so max_new_tokens wins.
                pipe.model.generation_config.max_length = None
                _pipeline_cache[model_id] = pipe
            except Exception as e:
                return None, str(e)
        return _pipeline_cache[model_id], None
113
+
114
+
115
+ # ──────────────────────────────────────────────
116
+ # INFERENCE
117
+ # ──────────────────────────────────────────────
118
+
119
# Shared system message prepended to every request.
SYSTEM_MSG = (
    "You are an expert educational assistant. "
    "Always respond with clean, well-structured Markdown text."
)


def ask_llm(model_label: str, prompt: str, hf_token: str = "") -> str:
    """Run *prompt* through the pipeline for the selected model label.

    Args:
        model_label: A key of ``MODELS`` (as chosen in the dropdown).
        prompt: The user prompt to answer.
        hf_token: Optional HF token forwarded to the loader.

    Returns:
        The assistant reply as Markdown, or a Markdown-formatted "❌ ..."
        error message if loading or inference fails — callers render either
        directly, so this function never raises.
    """
    model_id = MODELS[model_label]
    pipe, err = get_pipeline(model_id, hf_token)
    if err:
        return (
            f"❌ **Failed to load `{model_id}`:**\n```\n{err}\n```\n\n"
            "*Tip: Check your internet connection or choose a smaller model.*"
        )
    if pipe is None:
        return "❌ **Pipeline error: Pipeline object is None.**"
    try:
        messages = [
            {"role": "system", "content": SYSTEM_MSG},
            {"role": "user", "content": prompt},
        ]
        # Generation params go in the call (not the pipeline constructor) to
        # avoid constructor deprecation warnings.  50256 is a GPT-2-style EOS
        # fallback for tokenizers that define none.
        eos = (
            pipe.tokenizer.eos_token_id
            if (pipe.tokenizer and pipe.tokenizer.eos_token_id is not None)
            else 50256
        )
        out = pipe(messages, max_new_tokens=1024, pad_token_id=eos)
        generated = out[0]["generated_text"]
        if isinstance(generated, list):
            # Chat-template output — the last message is the assistant reply.
            return generated[-1]["content"]
        # Plain-string fallback.  BUG FIX: the original sliced off
        # len(str(messages)) characters, but str(messages) is the repr of the
        # message dicts, not the rendered prompt, so it chopped an arbitrary
        # prefix off the reply.  Strip the echoed prompt text itself instead.
        text = str(generated)
        if prompt in text:
            text = text.split(prompt, 1)[-1]
        return text.strip()
    except Exception as e:
        return f"❌ **Inference error:**\n```\n{str(e)}\n```"
156
+
157
+
158
+ # ──────────────────────────────────────────────
159
+ # PROMPTS
160
+ # ──────────────────────────────────────────────
161
+
162
def make_prompts(syllabus: str) -> dict:
    """Build the five task prompts for a syllabus/topic.

    Returns a dict with keys ``lesson``, ``qa``, ``mcq``, ``mindmap`` and
    ``infographic``; each value is a complete prompt string ending with the
    syllabus text.
    """
    # Every prompt ends with the same topic block — build it once.
    topic_block = f"\n\nSyllabus/Topic:\n{syllabus}"

    lesson = (
        "Create comprehensive, engaging lesson materials for the following syllabus/topic. "
        "Use clear ## headings, bullet points, bold key terms, and concise explanations "
        "suitable for a student." + topic_block
    )
    qa = (
        "Generate 8 important exam-style questions with detailed model answers based on "
        "this syllabus/topic. Number each Q&A pair clearly." + topic_block
    )
    mcq = (
        "Generate 8 multiple-choice questions based on this syllabus/topic. "
        "Each question must have 4 options (A–D). After all questions, list the correct "
        "answers with a brief explanation." + topic_block
    )
    mindmap = (
        "Create a high-level Flowchart or Mindmap for the following syllabus/topic using Mermaid.js syntax.\n"
        "STRICT RULES:\n"
        "- Output ONLY the mermaid code block (```mermaid ... ```).\n"
        "- Use 'graph TD' (for flowcharts) or 'mindmap' structure.\n"
        "- This will be converted into a static picture, so keep labels clear.\n"
        "- No introductory text, no explanation outside the block.\n"
        "- Avoid special characters in node labels." + topic_block
    )
    infographic = (
        "Create a highly visual text-based cheat sheet / infographic for this syllabus/topic. "
        "Use emojis, ASCII section dividers, tables, bullet points, and bold highlights "
        "to make it easy to scan, remember, and share." + topic_block
    )

    return {
        "lesson": lesson,
        "qa": qa,
        "mcq": mcq,
        "mindmap": mindmap,
        "infographic": infographic,
    }
194
+
195
+
196
+ # ──────────────────────────────────────────────
197
+ # MAIN GENERATION FUNCTION (progressive generator)
198
+ # ──────────────────────────────────────────────
199
+
200
def render_mermaid_as_image(text: str) -> str:
    """Extract a ```mermaid``` block from *text* and turn it into a mermaid.ink image link.

    Returns:
        A Markdown image tag pointing at mermaid.ink when a mermaid block is
        found; *text* unchanged when none is found; on encoding failure, a
        warning plus the raw mermaid block so nothing is lost.
    """
    import re
    import json

    # Look for a ```mermaid ... ``` fenced block.
    match = re.search(r'```mermaid\s+(.*?)\s+```', text, re.DOTALL)
    if not match:
        return text  # Return raw text if no block is found

    mermaid_code = match.group(1).strip()

    try:
        payload = {
            "code": mermaid_code,
            "mermaid": {"theme": "default"},
            "updateEditor": False,
            "autoSync": True,
            "updateDiagram": True,
        }
        # BUG FIX: mermaid.ink expects URL-SAFE base64 in the path.  Standard
        # base64 can emit '+' and '/', which corrupt the URL and produce
        # broken images for many diagrams.
        encoded = base64.urlsafe_b64encode(
            json.dumps(payload).encode("utf-8")
        ).decode("utf-8")
        image_url = f"https://mermaid.ink/img/{encoded}?type=webp"

        # Return ONLY the image tag ("picture only") — the tab renders it.
        return f"![Flowchart/Mindmap]({image_url})"
    except Exception as e:
        return f"*⚠️ Failed to render flowchart as image: {e}*\n\n```mermaid\n{mermaid_code}\n```"
228
+
229
def generate_content(syllabus_text: str, uploaded_file, model_label: str, hf_token: str):
    """Progressively generate all five study-material sections.

    A generator: yields a 5-tuple of Markdown strings (lesson, qa, mcq,
    mindmap, cheat sheet) after each section completes so the UI tabs update
    one at a time.
    """
    # Combine the pasted text with any text extracted from an uploaded file.
    extracted = extract_text_from_file(uploaded_file) if uploaded_file else ""
    topic = (syllabus_text.strip() + "\n\n" + extracted).strip()

    if not topic:
        yield ("⚠️ Please paste a syllabus/topic **or** upload a file.", "", "", "", "")
        return

    banner = f"*Model: **`{MODELS[model_label]}`***"
    prompt_map = make_prompts(topic)

    WAIT = "⏳ Waiting…"
    steps = [
        ("πŸ“– Generating Lesson Material… (1/5)", "lesson"),
        ("❓ Generating Q&A… (2/5)", "qa"),
        ("βœ… Generating MCQs… (3/5)", "mcq"),
        ("🧠 Generating Mindmap… (4/5)", "mindmap"),
        ("πŸ“Š Generating Cheat Sheet… (5/5)", "infographic"),
    ]

    # Initial frame: first tab shows its status, the rest show a placeholder.
    outputs = [banner + "\n\n" + steps[0][0]] + [WAIT] * (len(steps) - 1)
    yield tuple(outputs)

    for idx, (_status, key) in enumerate(steps):
        text = ask_llm(model_label, prompt_map[key], hf_token)

        # The mindmap tab renders a picture, not the raw mermaid source.
        if key == "mindmap":
            text = render_mermaid_as_image(text)

        outputs[idx] = banner + "\n\n" + text
        # Flip the next tab from "waiting" to its in-progress status.
        if idx + 1 < len(steps):
            outputs[idx + 1] = steps[idx + 1][0]
        yield tuple(outputs)
265
+
266
+
267
+ # ──────────────────────────────────────────────
268
+ # GRADIO UI
269
+ # ──────────────────────────────────────────────
270
+
271
+ CSS = """
272
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
273
+
274
+ body, .gradio-container {
275
+ font-family: 'Inter', sans-serif !important;
276
+ }
277
+
278
+ .app-header {
279
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
280
+ border-radius: 16px;
281
+ padding: 28px 32px;
282
+ margin-bottom: 8px;
283
+ border: 1px solid rgba(99,102,241,0.3);
284
+ }
285
+
286
+ .app-header h1 {
287
+ font-size: 2rem !important;
288
+ font-weight: 700 !important;
289
+ background: linear-gradient(90deg, #818cf8, #c084fc, #38bdf8);
290
+ -webkit-background-clip: text;
291
+ -webkit-text-fill-color: transparent;
292
+ margin-bottom: 6px !important;
293
+ }
294
+
295
+ .app-header p {
296
+ color: #94a3b8 !important;
297
+ font-size: 0.95rem;
298
+ }
299
+
300
+ .generate-btn {
301
+ background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
302
+ border: none !important;
303
+ border-radius: 12px !important;
304
+ font-weight: 600 !important;
305
+ font-size: 1rem !important;
306
+ transition: all 0.2s ease !important;
307
+ width: 100% !important;
308
+ }
309
+
310
+ .generate-btn:hover {
311
+ transform: translateY(-2px) !important;
312
+ box-shadow: 0 8px 25px rgba(99,102,241,0.4) !important;
313
+ }
314
+ """
315
+
316
+ with gr.Blocks() as demo:
317
+
318
+ # ── Header ──
319
+ gr.HTML("""
320
+ <div class="app-header">
321
+ <h1>πŸŽ“ AI Study Material Generator</h1>
322
+ <p>Generate lesson notes, Q&amp;A, MCQs, a Mindmap, and a Cheat Sheet from any topic
323
+ or syllabus β€” using SOTA open-source LLMs running entirely on your CPU via
324
+ <code>transformers.pipeline</code>. No API key needed.</p>
325
+ </div>
326
+ """)
327
+
328
+ # ── Input Row ──
329
+ with gr.Row(equal_height=False):
330
+
331
+ # Left: syllabus input (paste OR upload)
332
+ with gr.Column(scale=4):
333
+ with gr.Tabs():
334
+ with gr.TabItem("✏️ Paste Text"):
335
+ syllabus_input = gr.Textbox(
336
+ show_label=False,
337
+ placeholder=(
338
+ "Paste your syllabus, topic, or any content here…\n"
339
+ "e.g. The Water Cycle, Neural Networks, World War II, Photosynthesis"
340
+ ),
341
+ lines=7,
342
+ )
343
+ with gr.TabItem("πŸ“‚ Upload File"):
344
+ gr.Markdown(
345
+ "Upload a **PDF**, **Word (.docx)**, **plain text (.txt)**, "
346
+ "or **image** (PNG / JPG / WEBP) β€” text is extracted automatically."
347
+ )
348
+ file_input = gr.File(
349
+ label="Upload syllabus file",
350
+ file_types=[".pdf", ".docx", ".doc", ".txt",
351
+ ".png", ".jpg", ".jpeg", ".webp", ".bmp"],
352
+ file_count="single",
353
+ )
354
+ file_preview = gr.Textbox(
355
+ label="Extracted text preview",
356
+ lines=4,
357
+ interactive=False,
358
+ placeholder="Text extracted from the file will appear here…",
359
+ )
360
+ # Live preview when file is uploaded
361
+ file_input.change(
362
+ fn=lambda f: extract_text_from_file(f) if f else "",
363
+ inputs=file_input,
364
+ outputs=file_preview,
365
+ )
366
+
367
+ # Right: model selector + generate button
368
+ with gr.Column(scale=2):
369
+ model_selector = gr.Dropdown(
370
+ choices=ALL_MODEL_NAMES,
371
+ value=ALL_MODEL_NAMES[0],
372
+ label="πŸ€– Model (all run locally via pipeline)",
373
+ info=(
374
+ "Tier 1 = fastest / least RAM. "
375
+ "Tier 3 = best quality / needs 6–8 GB RAM. "
376
+ "Models download on first use."
377
+ ),
378
+ )
379
+ gr.Markdown(
380
+ "<small>πŸ’‘ **Llama** & **Gemma** models may require a Hugging Face login "
381
+ "(`huggingface-cli login`) or a Token to download.</small>"
382
+ )
383
+ hf_token_input = gr.Textbox(
384
+ label="πŸ”‘ HF Token (optional)",
385
+ info="Required for gated models. Your token stays private.",
386
+ type="password",
387
+ placeholder="hf_...",
388
+ )
389
+ generate_btn = gr.Button(
390
+ "⚑ Generate Study Materials",
391
+ variant="primary",
392
+ size="lg",
393
+ elem_classes=["generate-btn"],
394
+ )
395
+
396
+ gr.HTML("<hr style='margin:8px 0; border-color:rgba(99,102,241,0.2)'>")
397
+
398
+ # ── Output Tabs ──
399
+ with gr.Tabs():
400
+ with gr.TabItem("πŸ“– Lesson Material"):
401
+ lesson_output = gr.Markdown(value="*Results will appear here after generation.*")
402
+ with gr.TabItem("❓ Q & A"):
403
+ qa_output = gr.Markdown(value="*Results will appear here after generation.*")
404
+ with gr.TabItem("βœ… MCQs"):
405
+ mcq_output = gr.Markdown(value="*Results will appear here after generation.*")
406
+ with gr.TabItem("🧠 Mindmap"):
407
+ gr.Markdown("*The diagram is generated as an image (powered by mermaid.ink).*")
408
+ mindmap_output = gr.Markdown(value="*Results will appear here after generation.*")
409
+ with gr.TabItem("πŸ“Š Cheat Sheet"):
410
+ infographic_output = gr.Markdown(value="*Results will appear here after generation.*")
411
+
412
+ # ── Footer ──
413
+ gr.HTML("""
414
+ <div style='text-align:center; color:#64748b; font-size:0.8rem; margin-top:12px;'>
415
+ Built with πŸ€— Gradio Β· Hugging Face Transformers β€” 100% open-source Β· runs offline on CPU
416
+ </div>
417
+ """)
418
+
419
+ # ── Wire up button ──
420
+ generate_btn.click(
421
+ fn=generate_content,
422
+ inputs=[syllabus_input, file_input, model_selector, hf_token_input],
423
+ outputs=[lesson_output, qa_output, mcq_output, mindmap_output, infographic_output],
424
+ )
425
+
426
+ if __name__ == "__main__":
427
+ demo.launch(
428
+ theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
429
+ css=CSS,
430
+ )
requirements.txt CHANGED
@@ -1,4 +1,11 @@
1
- gradio
2
- transformers
3
- torch
4
- accelerate
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ huggingface_hub>=0.23.0
3
+ transformers>=4.44.0
4
+ torch
5
+ accelerate
6
+ sentencepiece
7
+ protobuf
8
+ pymupdf
9
+ python-docx
10
+ pytesseract
11
+ Pillow