Spaces:

staeiou
/

bartleby-examples

Sleeping

App Files Files Community

staeiou committed on Jan 1

Commit

28ddeb8

verified ·

1 Parent(s): e906781

Create app.py

Browse files

Files changed (1) hide show

app.py +314 -0

app.py ADDED Viewed

	@@ -0,0 +1,314 @@

+# app.py
+# Hugging Face Spaces (Gradio) app that:
+# 1) Loads a Transformers CausalLM from a Hub repo (prefers .safetensors)
+# 2) Runs a fixed list of prompts one-by-one (WITHOUT the "Q:" prefix)
+# 3) Saves the Q/A pairs into examples.md in the requested format
+#
+# Configure via Space Variables/Secrets (recommended):
+# - MODEL_REPO_ID:   e.g. "username/my-model-repo"
+# - REVISION:        optional (branch/tag/commit)
+# - HF_TOKEN:        optional if repo is private
+# - MAX_NEW_TOKENS:  optional (default 128)
+#
+# Notes:
+# - This expects the repo to be Transformers-compatible (config/tokenizer present).
+# - If your repo has multiple weight shards, Transformers will pick them up automatically.
+# - The generated examples.md is written to the Space's local filesystem and offered for download.
+import os
+import time
+from dataclasses import dataclass
+from typing import List, Tuple, Optional
+import torch
+import gradio as gr
+from huggingface_hub import snapshot_download
+from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
+# -----------------------------
+# Prompts (sent WITHOUT "Q:")
+# -----------------------------
+# Fixed prompt list, processed in this order by generate_examples(). The "Q:"
+# label is added only when the results are formatted; it is never sent to the model.
+RAW_PROMPTS: List[str] = [
+    # Short English requests.
+    "What is the capital of France?",
+    "Calculate 2+2",
+    "chocolate cake recipe",
+    "What model are you?",
+    # Non-linguistic input (presumably a robustness probe -- confirm intent).
+    "a;lkj2l1;j2r';13",
+    # Spanish prompts, each followed by its English counterpart.
+    "¿Cuántos libros había en la Biblioteca de Alejandría?",
+    "How many books were in the library of Alexandria?",
+    "Te amo, mi amor. ¿Me amas? ¿Soy tu amor?",
+    "My love, I love you. Do you love me? Am I your love?",
+    # Urdu prompt followed by its English counterpart.
+    "اردو بولنے والے کے طور پر کام کریں۔",
+    "Act as an Urdu speaker.",
+]
+@dataclass
+class LoadSettings:
+    """Options describing which Hub repo to load and how to place it in memory."""
+    # Hub repository id, e.g. "username/my-model-repo".
+    repo_id: str
+    # Optional branch, tag, or commit; None selects the repo's default revision.
+    revision: Optional[str] = None
+    # Optional access token, needed only when the repo is private.
+    hf_token: Optional[str] = None
+    # Weights dtype; when None, load_model_and_tokenizer() chooses one itself.
+    torch_dtype: Optional[torch.dtype] = None
+    # Target device. NOTE: this default is evaluated once, when the class is defined.
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+def _env_int(name: str, default: int) -> int:
+    try:
+        return int(os.getenv(name, default))
+    except Exception:
+        return default
+MAX_NEW_TOKENS_DEFAULT = _env_int("MAX_NEW_TOKENS", 128)
+# -----------------------------
+# Model loading
+# -----------------------------
+def load_model_and_tokenizer(settings: LoadSettings):
+    if not settings.repo_id or settings.repo_id.strip() == "":
+        raise ValueError("MODEL_REPO_ID is empty. Set it in Space variables or type it in the UI.")
+    # Download repo snapshot locally (fast subsequent runs due to caching)
+    local_dir = snapshot_download(
+        repo_id=settings.repo_id,
+        revision=settings.revision,
+        token=settings.hf_token,
+        local_dir=None,
+        local_dir_use_symlinks=False,
+    )
+    # Try to pick an appropriate dtype
+    if settings.torch_dtype is None:
+        if torch.cuda.is_available():
+            # bfloat16 is great on modern GPUs; fall back to float16 otherwise
+            settings.torch_dtype = torch.bfloat16 if torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16
+        else:
+            settings.torch_dtype = torch.float32
+    # Load tokenizer/config
+    config = AutoConfig.from_pretrained(local_dir)
+    tokenizer = AutoTokenizer.from_pretrained(local_dir, use_fast=True)
+    # Ensure pad token exists for generation if needed
+    if tokenizer.pad_token is None:
+        # Common safe fallback for causal LMs
+        tokenizer.pad_token = tokenizer.eos_token
+    # Load model (Transformers will prefer safetensors if present)
+    # device_map="auto" works well on GPU; on CPU it can be omitted.
+    if torch.cuda.is_available():
+        model = AutoModelForCausalLM.from_pretrained(
+            local_dir,
+            config=config,
+            torch_dtype=settings.torch_dtype,
+            device_map="auto",
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            local_dir,
+            config=config,
+            torch_dtype=settings.torch_dtype,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        ).to(settings.device)
+    model.eval()
+    return model, tokenizer, local_dir
+# -----------------------------
+# Prompt formatting + generation
+# -----------------------------
+def build_inputs(tokenizer, prompt: str, device: str):
+    # If the tokenizer supports a chat template, use it.
+    if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
+        messages = [{"role": "user", "content": prompt}]
+        input_ids = tokenizer.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            return_tensors="pt",
+        )
+        return input_ids.to(device)
+    # Plain text
+    enc = tokenizer(prompt, return_tensors="pt")
+    return enc["input_ids"].to(device)
+@torch.inference_mode()
+def generate_one(
+    model,
+    tokenizer,
+    prompt: str,
+    max_new_tokens: int = 128,
+    temperature: float = 0.0,
+) -> str:
+    device = next(model.parameters()).device
+    input_ids = build_inputs(tokenizer, prompt, device)
+    # Deterministic by default: do_sample=False when temperature == 0
+    do_sample = temperature is not None and temperature > 0
+    outputs = model.generate(
+        input_ids=input_ids,
+        max_new_tokens=max_new_tokens,
+        do_sample=do_sample,
+        temperature=temperature if do_sample else None,
+        top_p=0.95 if do_sample else None,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    # Decode only the newly generated tokens (cleanest "answer")
+    gen_ids = outputs[0, input_ids.shape[-1] :]
+    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
+    return text.strip()
+def format_examples_md(pairs: List[Tuple[str, str]]) -> str:
+    blocks = []
+    for q, a in pairs:
+        blocks.append(f"- Q: {q}\n- A: {a}".strip())
+    return "\n\n".join(blocks) + "\n"
+# -----------------------------
+# Gradio app logic
+# -----------------------------
+# Module-level state shared by the Gradio callbacks. All three stay None until
+# do_load() completes successfully; generate_examples() checks the first two.
+MODEL = None
+TOKENIZER = None
+MODEL_LOCAL_DIR = None
+def do_load(repo_id: str, revision: str, hf_token: str, max_new_tokens: int):
+    global MODEL, TOKENIZER, MODEL_LOCAL_DIR
+    repo_id = (repo_id or "").strip()
+    revision = (revision or "").strip() or None
+    hf_token = (hf_token or "").strip() or os.getenv("HF_TOKEN") or None
+    settings = LoadSettings(repo_id=repo_id, revision=revision, hf_token=hf_token)
+    MODEL, TOKENIZER, MODEL_LOCAL_DIR = load_model_and_tokenizer(settings)
+    info = [
+        f"Loaded repo: `{repo_id}`",
+        f"Revision: `{revision or 'default'}`",
+        f"Local snapshot dir: `{MODEL_LOCAL_DIR}`",
+        f"Device: `{next(MODEL.parameters()).device}`",
+        f"Default max_new_tokens: `{max_new_tokens}`",
+    ]
+    return "\n".join(info)
+def generate_examples(max_new_tokens: int, temperature: float):
+    if MODEL is None or TOKENIZER is None:
+        raise RuntimeError("Model not loaded. Click 'Load model' first (or set MODEL_REPO_ID and restart).")
+    pairs = []
+    for p in RAW_PROMPTS:
+        ans = generate_one(
+            MODEL,
+            TOKENIZER,
+            p,  # sent WITHOUT "Q:"
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+        )
+        # Keep answers single-line-ish for markdown readability (optional)
+        ans_clean = " ".join(ans.splitlines()).strip()
+        pairs.append((p, ans_clean))
+    md = format_examples_md(pairs)
+    # Write examples.md
+    out_path = os.path.abspath("examples.md")
+    with open(out_path, "w", encoding="utf-8") as f:
+        f.write(md)
+    return md, out_path
+def maybe_autoload():
+    """If MODEL_REPO_ID is set, load automatically on startup."""
+    repo_id = (os.getenv("MODEL_REPO_ID") or "").strip()
+    if not repo_id:
+        return "MODEL_REPO_ID not set. Enter a repo id and click 'Load model'."
+    revision = (os.getenv("REVISION") or "").strip() or None
+    hf_token = (os.getenv("HF_TOKEN") or "").strip() or None
+    max_new_tokens = _env_int("MAX_NEW_TOKENS", MAX_NEW_TOKENS_DEFAULT)
+    try:
+        return do_load(repo_id, revision or "", hf_token or "", max_new_tokens)
+    except Exception as e:
+        return f"Autoload failed: {type(e).__name__}: {e}"
+# Build the UI when the module is imported; `demo` is the Blocks app object
+# launched by the __main__ guard at the end of the file.
+with gr.Blocks(title="Safetensors QA -> examples.md") as demo:
+    gr.Markdown(
+        """
+# Safetensors QA → `examples.md`
+This Space loads a Transformers model (preferring `.safetensors`) from a Hub repo and generates answers for a fixed list of prompts (sent **without** the `Q:` prefix).
+Then it writes the results into `examples.md` in the requested `- Q:` / `- A:` format.
+"""
+    )
+    # --- Model selection: fields are pre-filled from the environment variables ---
+    with gr.Accordion("Model settings", open=True):
+        repo_id_in = gr.Textbox(
+            label="MODEL_REPO_ID (Hub repo)",
+            value=os.getenv("MODEL_REPO_ID", ""),
+            placeholder='e.g. "username/my-model-repo"',
+        )
+        revision_in = gr.Textbox(
+            label="Revision (optional)",
+            value=os.getenv("REVISION", ""),
+            placeholder="branch / tag / commit (leave empty for default)",
+        )
+        # Starts empty on purpose: do_load() falls back to the HF_TOKEN environment
+        # variable when this field is blank.
+        token_in = gr.Textbox(
+            label="HF_TOKEN (optional, for private repos)",
+            value="",
+            placeholder="Leave empty to use Space secret HF_TOKEN",
+            type="password",
+        )
+        load_btn = gr.Button("Load model", variant="primary")
+        # NOTE: maybe_autoload() is called right here, while the layout is being
+        # built, so construction waits until the model has loaded (or loading failed).
+        load_status = gr.Markdown(value=maybe_autoload())
+    # --- Generation knobs forwarded to generate_examples() ---
+    with gr.Accordion("Generation settings", open=True):
+        # NOTE(review): an environment default outside 16..1024 is not clamped
+        # here -- confirm how the slider treats an out-of-range initial value.
+        max_new_tokens_in = gr.Slider(
+            label="max_new_tokens",
+            minimum=16,
+            maximum=1024,
+            value=_env_int("MAX_NEW_TOKENS", MAX_NEW_TOKENS_DEFAULT),
+            step=1,
+        )
+        temperature_in = gr.Slider(
+            label="temperature (0 = deterministic)",
+            minimum=0.0,
+            maximum=2.0,
+            value=0.0,
+            step=0.05,
+        )
+    gr.Markdown("## Generate `examples.md`")
+    gen_btn = gr.Button("Run prompts and write examples.md", variant="secondary")
+    md_preview = gr.Markdown(label="Preview")
+    md_file = gr.File(label="Download examples.md")
+    # The slider value is passed to do_load() only so it can be echoed in the
+    # status text; loading itself does not use it.
+    load_btn.click(
+        fn=do_load,
+        inputs=[repo_id_in, revision_in, token_in, max_new_tokens_in],
+        outputs=[load_status],
+    )
+    # generate_examples() returns (markdown text, file path), matching the two
+    # output components in order.
+    gen_btn.click(
+        fn=generate_examples,
+        inputs=[max_new_tokens_in, temperature_in],
+        outputs=[md_preview, md_file],
+    )
+# Launch the app only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    demo.launch()