drixo Cursor committed on
Commit
69abda4
·
1 Parent(s): d65c73c

Add Gradio Space app, push_to_hub, README, fix train/test paths

Browse files
Files changed (7) hide show
  1. .gitignore +3 -0
  2. README.md +69 -0
  3. app.py +113 -0
  4. push_to_hub.py +43 -0
  5. requirements.txt +5 -4
  6. test_model.py +14 -7
  7. train.py +38 -10
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ multilingual-doc-model/
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multilingual Document Assistant
2
+
3
+ Agent-style model for explaining documents, answering questions, and responding conversationally in:
4
+
5
+ - **Spanish**
6
+ - **Chinese**
7
+ - **Vietnamese**
8
+ - **Portuguese**
9
+
10
+ Base model: [bigscience/bloom-560m](https://huggingface.co/bigscience/bloom-560m) on Hugging Face.
11
+
12
+ ---
13
+
14
+ ## Run on Hugging Face
15
+
16
+ To run this as a **Hugging Face Space** (browser chat UI):
17
+
18
+ 1. **Create a Space** at [huggingface.co/new-space](https://huggingface.co/new-space):
19
+ - Choose **Gradio**.
20
+ - Clone or upload this repo (at least `app.py` and `requirements.txt`).
21
+
22
+ 2. **Use your fine-tuned model** (after training and pushing):
23
+ - Train: `python train.py`
24
+ - Push to Hub: `export HF_REPO_ID=your-username/multilingual-doc-assistant` then `python push_to_hub.py`
25
+ - In the Space, go to **Settings → Variables** and add:
26
+ - `HF_MODEL_ID` = `your-username/multilingual-doc-assistant`
27
+ - The app will load your model from the Hub. Without this, it uses the base BLOOM model.
28
+
29
+ 3. The Space runs `app.py` and serves the Gradio chat interface.
30
+
31
+ ---
32
+
33
+ ## Setup (local)
34
+
35
+ ```bash
36
+ cd multilingual-doc-assistant
37
+ pip install -r requirements.txt
38
+ ```
39
+
40
+ ## Train
41
+
42
+ ```bash
43
+ python train.py
44
+ ```
45
+
46
+ Saves the fine-tuned model and tokenizer to `./multilingual-doc-model`. You can run from any directory; paths are relative to the script.
47
+
48
+ ## Test / Chat
49
+
50
+ After training:
51
+
52
+ ```bash
53
+ python test_model.py
54
+ ```
55
+
56
+ Uses a Spanish prompt by default. You can edit the `prompt` in `test_model.py` to try other languages or questions.
57
+
58
+ ## Training data
59
+
60
+ Add more examples in `train.jsonl` (one JSON object per line with a `"text"` key). Use the same `User:` / `Assistant:` format so the model learns the conversational style.
61
+
62
+ ## Run the Space UI locally
63
+
64
+ ```bash
65
+ pip install -r requirements.txt
66
+ python app.py
67
+ ```
68
+
69
+ Then open the URL Gradio prints (e.g. http://127.0.0.1:7860). To use your trained model locally, set `HF_MODEL_ID` to a Hub repo or a local path; for a local folder use the path to `multilingual-doc-model` (transformers supports local paths).
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Space: Multilingual Document Assistant
3
+ Run this as a Gradio app on Hugging Face Spaces.
4
+ Set HF_MODEL_ID to your Hub model (e.g. your-username/multilingual-doc-assistant).
5
+ """
6
+ import os
7
+ import torch
8
+ import gradio as gr
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
10
+
11
+ # Model: Hub id (e.g. your-username/multilingual-doc-assistant) or local path.
12
+ # On Spaces set HF_MODEL_ID in Settings → Variables. Local: use trained folder if present.
13
+ _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
14
+ _LOCAL_MODEL = os.path.join(_SCRIPT_DIR, "multilingual-doc-model")
15
+ HF_MODEL_ID = os.environ.get("HF_MODEL_ID") or (_LOCAL_MODEL if os.path.isdir(_LOCAL_MODEL) else "bigscience/bloom-560m")
16
+
17
def load_pipeline():
    """Build a text-generation pipeline for HF_MODEL_ID (Hub id or local folder).

    Ensures the tokenizer has a pad token (BLOOM ships without one) and
    places the pipeline on GPU 0 when CUDA is available, CPU otherwise.
    """
    tok = AutoTokenizer.from_pretrained(HF_MODEL_ID)
    lm = AutoModelForCausalLM.from_pretrained(HF_MODEL_ID)
    # Generation with padding needs a pad token; reuse EOS when missing.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    # Pipeline device convention: 0 = first GPU, -1 = CPU.
    return pipeline(
        "text-generation",
        model=lm,
        tokenizer=tok,
        device=0 if torch.cuda.is_available() else -1,
    )

# Load once at startup (Spaces will cache)
pipe = load_pipeline()
32
+
33
+ def _get_text(content):
34
+ """Extract plain text from Gradio message content (str or list of parts)."""
35
+ if isinstance(content, str):
36
+ return content
37
+ if isinstance(content, list):
38
+ for part in content:
39
+ if isinstance(part, dict) and part.get("type") == "text":
40
+ return part.get("text", "")
41
+ if isinstance(part, str):
42
+ return part
43
+ return ""
44
+
45
def build_prompt(history, message):
    """Render chat *history* plus the new *message* as a User:/Assistant: transcript.

    Supports both Gradio history formats: (user, assistant) tuples and
    openai-style role dicts. BUG FIX: the previous version only appended a
    turn ``if user_msg``, so in dict format (used by type="messages")
    every assistant reply had user_msg == "" and was silently dropped —
    the model never saw its own prior answers. Dict turns are now paired:
    a user turn is held until its assistant reply arrives.
    """
    parts = []
    pending_user = None  # user dict-turn awaiting its assistant reply
    for turn in history:
        if isinstance(turn, (list, tuple)) and len(turn) >= 2:
            user_msg, assistant_msg = str(turn[0] or ""), str(turn[1] or "")
            if user_msg or assistant_msg:
                parts.append(f"User: {user_msg}\nAssistant: {assistant_msg}")
        elif isinstance(turn, dict):
            content = turn.get("content", "")
            if not isinstance(content, str):
                content = _get_text(content)
            if not content:
                continue
            if turn.get("role", "") == "user":
                # Flush a dangling user turn that never got a reply.
                if pending_user is not None:
                    parts.append(f"User: {pending_user}\nAssistant: ")
                pending_user = content
            else:
                parts.append(f"User: {pending_user or ''}\nAssistant: {content}")
                pending_user = None
    if pending_user is not None:
        parts.append(f"User: {pending_user}\nAssistant: ")
    parts.append(f"User: {message}\nAssistant:")
    return "\n".join(parts)
65
+
66
def chat(message, history):
    """Gradio chat callback: generate one assistant reply.

    Builds a User:/Assistant: transcript, samples a continuation from the
    pipeline, and returns only the newly generated assistant text.
    """
    if not message.strip():
        return ""
    prompt = build_prompt(history, message)
    out = pipe(
        prompt,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        pad_token_id=pipe.tokenizer.pad_token_id,
    )
    full = out[0]["generated_text"]
    # BUG FIX: splitting on the LAST "Assistant:" discarded the real reply
    # whenever the model hallucinated further "User:/Assistant:" turns.
    # Stripping the prompt prefix isolates exactly the new continuation.
    if full.startswith(prompt):
        reply = full[len(prompt):].strip()
    elif "Assistant:" in full:
        # Fallback if the pipeline altered the echoed prompt.
        reply = full.split("Assistant:")[-1].strip()
    else:
        reply = full.strip()
    # Truncate at the first simulated follow-up user turn.
    if "\nUser:" in reply:
        reply = reply.split("\nUser:", 1)[0].strip()
    return reply
88
+
89
# Space UI. BUG FIX: retry_btn/undo_btn/clear_btn were removed from
# gr.ChatInterface in Gradio 5; with gradio>=4.0.0 in requirements pip
# resolves to 5.x, so passing them raised TypeError at startup. The
# default retry/undo/clear controls are built in — the kwargs are dropped.
with gr.Blocks(
    title="Multilingual Document Assistant",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
    # Multilingual Document Assistant
    **Supports:** Spanish · Chinese · Vietnamese · Portuguese
    Ask about documents, get explanations, or chat. *(Agent-style responses)*
    """)
    gr.ChatInterface(
        fn=chat,
        type="messages",
        examples=[
            ["Explícame este documento: La IA mejora la productividad."],
            ["总结这段文字: 人工智能正在改变世界。"],
            ["Giải thích đoạn này: Công nghệ giúp cuộc sống dễ dàng hơn."],
        ],
    )
    gr.Markdown(f"*Model: `{HF_MODEL_ID}`*")

if __name__ == "__main__":
    demo.launch()
push_to_hub.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Push your trained model to the Hugging Face Hub so the Space can load it.
3
+ Run after train.py. Requires: pip install huggingface_hub and login (huggingface-cli login).
4
+ """
5
+ import os
6
+ from huggingface_hub import HfApi, create_repo, upload_folder
7
+
8
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
9
+ MODEL_DIR = os.path.join(SCRIPT_DIR, "multilingual-doc-model")
10
+
11
def main():
    """Push the locally trained model folder to the Hugging Face Hub."""
    # Nothing to upload until train.py has produced the model folder.
    if not os.path.isdir(MODEL_DIR):
        print(f"Not found: {MODEL_DIR}")
        print("Run train.py first to train the model.")
        return

    # Target repo comes from the environment; the placeholder means "not configured".
    repo_id = os.environ.get("HF_REPO_ID", "YOUR_USERNAME/multilingual-doc-assistant")
    if "YOUR_USERNAME" in repo_id:
        print("Set your Hub repo id:")
        print(" export HF_REPO_ID=your-username/multilingual-doc-assistant")
        print(" or edit HF_REPO_ID in this script.")
        return

    hub = HfApi()
    try:
        # Idempotent: exist_ok makes re-runs safe.
        create_repo(repo_id, exist_ok=True, repo_type="model")
    except Exception as err:
        print("Create repo failed (maybe need to login):", err)
        print("Run: huggingface-cli login")
        return

    print(f"Uploading {MODEL_DIR} to https://huggingface.co/{repo_id} ...")
    hub.upload_folder(
        folder_path=MODEL_DIR,
        repo_id=repo_id,
        repo_type="model",
    )
    print("Done. Use this model in your Space by setting:")
    print(f" HF_MODEL_ID={repo_id}")

if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
- transformers
2
- datasets
3
- torch
4
- accelerate
5
  sentencepiece
6
  huggingface_hub
 
 
1
+ transformers>=4.36.0
2
+ datasets>=2.14.0
3
+ torch>=2.0.0
4
+ accelerate>=0.25.0
5
  sentencepiece
6
  huggingface_hub
7
+ gradio>=4.0.0
test_model.py CHANGED
@@ -1,17 +1,24 @@
1
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
2
 
3
- model_path = "./model"
 
 
 
 
 
 
4
 
5
  tokenizer = AutoTokenizer.from_pretrained(model_path)
6
  model = AutoModelForCausalLM.from_pretrained(model_path)
7
 
8
- pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
 
 
9
 
10
- prompt = """
11
- User: Explícame este documento:
12
  La IA mejora la productividad.
13
- Assistant:
14
- """
15
 
16
- result = pipe(prompt, max_new_tokens=120)
17
  print(result[0]["generated_text"])
 
1
"""Quick smoke test: load the fine-tuned model and generate from a Spanish prompt."""
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import os
import torch  # FIX: proper top-level import instead of inline __import__ hack

# Same output dir as train.py (works from any cwd)
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(SCRIPT_DIR, "multilingual-doc-model")

if not os.path.isdir(model_path):
    print(f"Model not found at {model_path}. Run train.py first to train the model.")
    # FIX: raise SystemExit instead of exit() — exit() is a site-module
    # convenience not guaranteed outside interactive sessions.
    raise SystemExit(1)

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Use GPU if available, else CPU
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

prompt = """User: Explícame este documento:
La IA mejora la productividad.
Assistant:"""

result = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.7)
print(result[0]["generated_text"])
train.py CHANGED
@@ -1,40 +1,68 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
 
 
 
 
 
 
2
  from datasets import load_dataset
 
 
 
 
 
 
 
3
 
4
  model_id = "bigscience/bloom-560m"
5
 
6
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
 
 
7
  model = AutoModelForCausalLM.from_pretrained(model_id)
 
 
8
 
9
- dataset = load_dataset("json", data_files="train.jsonl")
10
 
11
  def tokenize(example):
12
  return tokenizer(
13
  example["text"],
14
  truncation=True,
15
- padding="max_length",
16
- max_length=512
17
  )
18
 
19
- tokenized_dataset = dataset.map(tokenize)
 
 
 
 
 
 
 
 
 
20
 
21
  training_args = TrainingArguments(
22
- output_dir="./multilingual-doc-model",
23
  per_device_train_batch_size=2,
24
  num_train_epochs=3,
25
  logging_steps=10,
26
  save_steps=500,
27
  learning_rate=2e-5,
28
- fp16=True
29
  )
30
 
31
  trainer = Trainer(
32
  model=model,
33
  args=training_args,
34
- train_dataset=tokenized_dataset["train"]
 
35
  )
36
 
37
  trainer.train()
38
 
39
- model.save_pretrained("./multilingual-doc-model")
40
- tokenizer.save_pretrained("./multilingual-doc-model")
 
1
"""Fine-tune bigscience/bloom-560m on train.jsonl for the multilingual document assistant.

Reads one JSON object per line with a "text" key, tokenizes with
truncation, and trains a causal-LM with dynamic padding. Saves the model
and tokenizer to ./multilingual-doc-model next to this script.
"""
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch
import os

# Paths relative to this script so you can run from any cwd
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_FILE = os.path.join(SCRIPT_DIR, "train.jsonl")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "multilingual-doc-model")

model_id = "bigscience/bloom-560m"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# BLOOM has no pad_token by default; required for batching
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_id)
# Keep model config consistent with the tokenizer's pad token.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

dataset = load_dataset("json", data_files={"train": DATA_FILE}, split="train")

def tokenize(example):
    """Tokenize one record's "text" field, truncating to 512 tokens.

    No padding here — the collator pads dynamically per batch.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=512,
    )

# Drop the raw "text" column so batches contain only model inputs.
tokenized_dataset = dataset.map(
    tokenize,
    remove_columns=dataset.column_names,
    desc="Tokenizing",
)

# mlm=False → causal-LM objective: labels are a shifted copy of input_ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    learning_rate=2e-5,
    # Mixed precision only when a GPU is present (fp16 fails on CPU).
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

# Persist both model weights and tokenizer so inference can load the folder directly.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)