drixo Cursor committed on
Commit
69abda4
·
1 Parent(s): d65c73c

Add Gradio Space app, push_to_hub, README, fix train/test paths

Browse files
Files changed (7) hide show
  1. .gitignore +3 -0
  2. README.md +69 -0
  3. app.py +113 -0
  4. push_to_hub.py +43 -0
  5. requirements.txt +5 -4
  6. test_model.py +14 -7
  7. train.py +38 -10
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ multilingual-doc-model/
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multilingual Document Assistant
2
+
3
+ Agent-style model for explaining documents, answering questions, and responding conversationally in:
4
+
5
+ - **Spanish**
6
+ - **Chinese**
7
+ - **Vietnamese**
8
+ - **Portuguese**
9
+
10
+ Base model: [bigscience/bloom-560m](https://huggingface.co/bigscience/bloom-560m) on Hugging Face.
11
+
12
+ ---
13
+
14
+ ## Run on Hugging Face
15
+
16
+ To run this as a **Hugging Face Space** (browser chat UI):
17
+
18
+ 1. **Create a Space** at [huggingface.co/new-space](https://huggingface.co/new-space):
19
+ - Choose **Gradio**.
20
+ - Clone or upload this repo (at least `app.py` and `requirements.txt`).
21
+
22
+ 2. **Use your fine-tuned model** (after training and pushing):
23
+ - Train: `python train.py`
24
+ - Push to Hub: `export HF_REPO_ID=your-username/multilingual-doc-assistant` then `python push_to_hub.py`
25
+ - In the Space, go to **Settings → Variables** and add:
26
+ - `HF_MODEL_ID` = `your-username/multilingual-doc-assistant`
27
+ - The app will load your model from the Hub. Without this, it uses the base BLOOM model.
28
+
29
+ 3. The Space runs `app.py` and serves the Gradio chat interface.
30
+
31
+ ---
32
+
33
+ ## Setup (local)
34
+
35
+ ```bash
36
+ cd multilingual-doc-assistant
37
+ pip install -r requirements.txt
38
+ ```
39
+
40
+ ## Train
41
+
42
+ ```bash
43
+ python train.py
44
+ ```
45
+
46
+ Saves the fine-tuned model and tokenizer to `./multilingual-doc-model`. You can run from any directory; paths are relative to the script.
47
+
48
+ ## Test / Chat
49
+
50
+ After training:
51
+
52
+ ```bash
53
+ python test_model.py
54
+ ```
55
+
56
+ Uses a Spanish prompt by default. You can edit the `prompt` in `test_model.py` to try other languages or questions.
57
+
58
+ ## Training data
59
+
60
+ Add more examples in `train.jsonl` (one JSON object per line with a `"text"` key). Use the same `User:` / `Assistant:` format so the model learns the conversational style.
61
+
62
+ ## Run the Space UI locally
63
+
64
+ ```bash
65
+ pip install -r requirements.txt
66
+ python app.py
67
+ ```
68
+
69
+ Then open the URL Gradio prints (e.g. http://127.0.0.1:7860). To use your trained model locally, set `HF_MODEL_ID` to a Hub repo or a local path; for a local folder use the path to `multilingual-doc-model` (transformers supports local paths).
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Space: Multilingual Document Assistant
3
+ Run this as a Gradio app on Hugging Face Spaces.
4
+ Set HF_MODEL_ID to your Hub model (e.g. your-username/multilingual-doc-assistant).
5
+ """
6
+ import os
7
+ import torch
8
+ import gradio as gr
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
10
+
11
+ # Model: Hub id (e.g. your-username/multilingual-doc-assistant) or local path.
12
+ # On Spaces set HF_MODEL_ID in Settings → Variables. Local: use trained folder if present.
13
+ _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
14
+ _LOCAL_MODEL = os.path.join(_SCRIPT_DIR, "multilingual-doc-model")
15
+ HF_MODEL_ID = os.environ.get("HF_MODEL_ID") or (_LOCAL_MODEL if os.path.isdir(_LOCAL_MODEL) else "bigscience/bloom-560m")
16
+
17
def load_pipeline():
    """Build a text-generation pipeline for HF_MODEL_ID (Hub id or local folder).

    Ensures the tokenizer has a pad token (BLOOM ships without one) and
    places the pipeline on GPU 0 when CUDA is available, CPU otherwise.
    """
    tok = AutoTokenizer.from_pretrained(HF_MODEL_ID)
    lm = AutoModelForCausalLM.from_pretrained(HF_MODEL_ID)
    # Generation with padding needs a pad token; reuse EOS when missing.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    # Pipeline device convention: 0 = first GPU, -1 = CPU.
    return pipeline(
        "text-generation",
        model=lm,
        tokenizer=tok,
        device=0 if torch.cuda.is_available() else -1,
    )

# Load once at startup (Spaces will cache)
pipe = load_pipeline()
32
+
33
+ def _get_text(content):
34
+ """Extract plain text from Gradio message content (str or list of parts)."""
35
+ if isinstance(content, str):
36
+ return content
37
+ if isinstance(content, list):
38
+ for part in content:
39
+ if isinstance(part, dict) and part.get("type") == "text":
40
+ return part.get("text", "")
41
+ if isinstance(part, str):
42
+ return part
43
+ return ""
44
+
45
def build_prompt(history, message):
    """Render chat *history* plus the new *message* as a User:/Assistant: transcript.

    Supports both Gradio history formats: (user, assistant) tuples and
    openai-style role dicts. BUG FIX: the previous version only appended a
    turn ``if user_msg``, so in dict format (used by type="messages")
    every assistant reply had user_msg == "" and was silently dropped —
    the model never saw its own prior answers. Dict turns are now paired:
    a user turn is held until its assistant reply arrives.
    """
    parts = []
    pending_user = None  # user dict-turn awaiting its assistant reply
    for turn in history:
        if isinstance(turn, (list, tuple)) and len(turn) >= 2:
            user_msg, assistant_msg = str(turn[0] or ""), str(turn[1] or "")
            if user_msg or assistant_msg:
                parts.append(f"User: {user_msg}\nAssistant: {assistant_msg}")
        elif isinstance(turn, dict):
            content = turn.get("content", "")
            if not isinstance(content, str):
                content = _get_text(content)
            if not content:
                continue
            if turn.get("role", "") == "user":
                # Flush a dangling user turn that never got a reply.
                if pending_user is not None:
                    parts.append(f"User: {pending_user}\nAssistant: ")
                pending_user = content
            else:
                parts.append(f"User: {pending_user or ''}\nAssistant: {content}")
                pending_user = None
    if pending_user is not None:
        parts.append(f"User: {pending_user}\nAssistant: ")
    parts.append(f"User: {message}\nAssistant:")
    return "\n".join(parts)
65
+
66
def chat(message, history):
    """Gradio chat callback: generate one assistant reply.

    Builds a User:/Assistant: transcript, samples a continuation from the
    pipeline, and returns only the newly generated assistant text.
    """
    if not message.strip():
        return ""
    prompt = build_prompt(history, message)
    out = pipe(
        prompt,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        pad_token_id=pipe.tokenizer.pad_token_id,
    )
    full = out[0]["generated_text"]
    # BUG FIX: splitting on the LAST "Assistant:" discarded the real reply
    # whenever the model hallucinated further "User:/Assistant:" turns.
    # Stripping the prompt prefix isolates exactly the new continuation.
    if full.startswith(prompt):
        reply = full[len(prompt):].strip()
    elif "Assistant:" in full:
        # Fallback if the pipeline altered the echoed prompt.
        reply = full.split("Assistant:")[-1].strip()
    else:
        reply = full.strip()
    # Truncate at the first simulated follow-up user turn.
    if "\nUser:" in reply:
        reply = reply.split("\nUser:", 1)[0].strip()
    return reply
88
+
89
# Space UI. BUG FIX: retry_btn/undo_btn/clear_btn were removed from
# gr.ChatInterface in Gradio 5; with gradio>=4.0.0 in requirements pip
# resolves to 5.x, so passing them raised TypeError at startup. The
# default retry/undo/clear controls are built in — the kwargs are dropped.
with gr.Blocks(
    title="Multilingual Document Assistant",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
    # Multilingual Document Assistant
    **Supports:** Spanish · Chinese · Vietnamese · Portuguese
    Ask about documents, get explanations, or chat. *(Agent-style responses)*
    """)
    gr.ChatInterface(
        fn=chat,
        type="messages",
        examples=[
            ["Explícame este documento: La IA mejora la productividad."],
            ["总结这段文字: 人工智能正在改变世界。"],
            ["Giải thích đoạn này: Công nghệ giúp cuộc sống dễ dàng hơn."],
        ],
    )
    gr.Markdown(f"*Model: `{HF_MODEL_ID}`*")

if __name__ == "__main__":
    demo.launch()
push_to_hub.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Push your trained model to the Hugging Face Hub so the Space can load it.
3
+ Run after train.py. Requires: pip install huggingface_hub and login (huggingface-cli login).
4
+ """
5
+ import os
6
+ from huggingface_hub import HfApi, create_repo, upload_folder
7
+
8
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
9
+ MODEL_DIR = os.path.join(SCRIPT_DIR, "multilingual-doc-model")
10
+
11
def main():
    """Push the locally trained model folder to the Hugging Face Hub."""
    # Nothing to upload until train.py has produced the model folder.
    if not os.path.isdir(MODEL_DIR):
        print(f"Not found: {MODEL_DIR}")
        print("Run train.py first to train the model.")
        return

    # Target repo comes from the environment; the placeholder means "not configured".
    repo_id = os.environ.get("HF_REPO_ID", "YOUR_USERNAME/multilingual-doc-assistant")
    if "YOUR_USERNAME" in repo_id:
        print("Set your Hub repo id:")
        print(" export HF_REPO_ID=your-username/multilingual-doc-assistant")
        print(" or edit HF_REPO_ID in this script.")
        return

    hub = HfApi()
    try:
        # Idempotent: exist_ok makes re-runs safe.
        create_repo(repo_id, exist_ok=True, repo_type="model")
    except Exception as err:
        print("Create repo failed (maybe need to login):", err)
        print("Run: huggingface-cli login")
        return

    print(f"Uploading {MODEL_DIR} to https://huggingface.co/{repo_id} ...")
    hub.upload_folder(
        folder_path=MODEL_DIR,
        repo_id=repo_id,
        repo_type="model",
    )
    print("Done. Use this model in your Space by setting:")
    print(f" HF_MODEL_ID={repo_id}")

if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
- transformers
2
- datasets
3
- torch
4
- accelerate
5
  sentencepiece
6
  huggingface_hub
 
 
1
+ transformers>=4.36.0
2
+ datasets>=2.14.0
3
+ torch>=2.0.0
4
+ accelerate>=0.25.0
5
  sentencepiece
6
  huggingface_hub
7
+ gradio>=4.0.0
test_model.py CHANGED
@@ -1,17 +1,24 @@
1
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
2
 
3
- model_path = "./model"
 
 
 
 
 
 
4
 
5
  tokenizer = AutoTokenizer.from_pretrained(model_path)
6
  model = AutoModelForCausalLM.from_pretrained(model_path)
7
 
8
- pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
 
 
9
 
10
- prompt = """
11
- User: Explícame este documento:
12
  La IA mejora la productividad.
13
- Assistant:
14
- """
15
 
16
- result = pipe(prompt, max_new_tokens=120)
17
  print(result[0]["generated_text"])
 
1
"""Quick smoke test: load the fine-tuned model and generate from a Spanish prompt."""
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import os
import torch  # FIX: proper top-level import instead of inline __import__ hack

# Same output dir as train.py (works from any cwd)
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(SCRIPT_DIR, "multilingual-doc-model")

if not os.path.isdir(model_path):
    print(f"Model not found at {model_path}. Run train.py first to train the model.")
    # FIX: raise SystemExit instead of exit() — exit() is a site-module
    # convenience not guaranteed outside interactive sessions.
    raise SystemExit(1)

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Use GPU if available, else CPU
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

prompt = """User: Explícame este documento:
La IA mejora la productividad.
Assistant:"""

result = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.7)
print(result[0]["generated_text"])
train.py CHANGED
@@ -1,40 +1,68 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
 
 
 
 
 
 
2
  from datasets import load_dataset
 
 
 
 
 
 
 
3
 
4
  model_id = "bigscience/bloom-560m"
5
 
6
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
 
 
7
  model = AutoModelForCausalLM.from_pretrained(model_id)
 
 
8
 
9
- dataset = load_dataset("json", data_files="train.jsonl")
10
 
11
  def tokenize(example):
12
  return tokenizer(
13
  example["text"],
14
  truncation=True,
15
- padding="max_length",
16
- max_length=512
17
  )
18
 
19
- tokenized_dataset = dataset.map(tokenize)
 
 
 
 
 
 
 
 
 
20
 
21
  training_args = TrainingArguments(
22
- output_dir="./multilingual-doc-model",
23
  per_device_train_batch_size=2,
24
  num_train_epochs=3,
25
  logging_steps=10,
26
  save_steps=500,
27
  learning_rate=2e-5,
28
- fp16=True
29
  )
30
 
31
  trainer = Trainer(
32
  model=model,
33
  args=training_args,
34
- train_dataset=tokenized_dataset["train"]
 
35
  )
36
 
37
  trainer.train()
38
 
39
- model.save_pretrained("./multilingual-doc-model")
40
- tokenizer.save_pretrained("./multilingual-doc-model")
 
1
"""Fine-tune bigscience/bloom-560m on train.jsonl for the multilingual document assistant.

Reads one JSON object per line with a "text" key, tokenizes with
truncation, and trains a causal-LM with dynamic padding. Saves the model
and tokenizer to ./multilingual-doc-model next to this script.
"""
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch
import os

# Paths relative to this script so you can run from any cwd
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_FILE = os.path.join(SCRIPT_DIR, "train.jsonl")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "multilingual-doc-model")

model_id = "bigscience/bloom-560m"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# BLOOM has no pad_token by default; required for batching
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_id)
# Keep model config consistent with the tokenizer's pad token.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

dataset = load_dataset("json", data_files={"train": DATA_FILE}, split="train")

def tokenize(example):
    """Tokenize one record's "text" field, truncating to 512 tokens.

    No padding here — the collator pads dynamically per batch.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=512,
    )

# Drop the raw "text" column so batches contain only model inputs.
tokenized_dataset = dataset.map(
    tokenize,
    remove_columns=dataset.column_names,
    desc="Tokenizing",
)

# mlm=False → causal-LM objective: labels are a shifted copy of input_ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    learning_rate=2e-5,
    # Mixed precision only when a GPU is present (fp16 fails on CPU).
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

# Persist both model weights and tokenizer so inference can load the folder directly.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)