iqasimz committed
Commit efc82e9 · verified · 1 Parent(s): 51423de

Update app.py

Files changed (1)
app.py +130 -4
app.py CHANGED
@@ -1,5 +1,131 @@
- import gradio as gr, os
- def ping(x): return f"ok: {x}"
- demo = gr.Interface(ping, gr.Textbox(), gr.Textbox())
  if __name__ == "__main__":
- demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))

+ import os
+ import json
+ import warnings
+ import torch
+ import gradio as gr
+ import spaces
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # ---------- CONFIG ----------
+ os.environ.setdefault("GRADIO_SERVER_PORT", "7860")
+ MODEL_PATH = "iqasimz/g3"
+ MAX_NEW_TOKENS_DEFAULT = 500
+ TEMPERATURE_DEFAULT = 0.6
+ TOP_P_DEFAULT = 0.95
+ # ---------------------------
+
+ warnings.filterwarnings("ignore", module="torch")
+ _model_cache = {}
+
+ def _ensure_pad_token(tokenizer):
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+     return tokenizer
+
+ def load_model_to_cpu(model_dir: str):
+     """Load tokenizer+model once on CPU; moved to GPU per request via @spaces.GPU."""
+     if model_dir in _model_cache:
+         return _model_cache[model_dir]
+
+     tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+     tok = _ensure_pad_token(tok)
+
+     mdl = AutoModelForCausalLM.from_pretrained(
+         model_dir,
+         trust_remote_code=True,
+         torch_dtype=torch.float16,  # model runs in fp16 when moved to GPU
+         device_map=None,            # keep on CPU for caching
+     )
+     mdl.eval()
+     _model_cache[model_dir] = (tok, mdl)
+     print(f"[cache] Loaded {model_dir} on CPU")
+     return tok, mdl
+
+ @spaces.GPU(duration=120)
+ def generate_text(input_text, max_tokens):
+     if not input_text.strip():
+         return "Please enter some text."
+
+     tokenizer, model = load_model_to_cpu(MODEL_PATH)
+     model = model.to("cuda")
+
+     # Format with Qwen3 chat template
+     messages = [{"role": "user", "content": input_text}]
+     formatted_text = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+
+     # Tokenize
+     inputs = tokenizer(formatted_text, return_tensors="pt").to(model.device)
+
+     # Generate
+     with torch.inference_mode():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=int(max_tokens),
+             temperature=TEMPERATURE_DEFAULT,
+             top_p=TOP_P_DEFAULT,
+             do_sample=True,
+             pad_token_id=tokenizer.eos_token_id,
+             eos_token_id=tokenizer.eos_token_id,
+             use_cache=True,
+         )
+
+     # Decode full response
+     full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
+
+     # Extract assistant response (handle Qwen3 format)
+     if "<|Assistant|>" in full_response:
+         response = full_response.split("<|Assistant|>")[-1]
+         response = response.split("<|end▁of▁sentence|>")[0].strip()
+     else:
+         # Fallback: decode only new tokens
+         new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
+         response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+
+     return response
+
+ def launch_app():
+     with gr.Blocks(title="iqasimz/g3 - Raw Output") as demo:
+         gr.Markdown("# iqasimz/g3 Model")
+         gr.Markdown("Enter text and get raw model output")
+
+         with gr.Row():
+             with gr.Column():
+                 input_text = gr.Textbox(
+                     label="Input Text",
+                     lines=8,
+                     placeholder="Enter your text here..."
+                 )
+                 max_tokens = gr.Slider(
+                     minimum=50,
+                     maximum=5000,
+                     value=MAX_NEW_TOKENS_DEFAULT,
+                     step=50,
+                     label="Max New Tokens"
+                 )
+                 generate_btn = gr.Button("Generate", variant="primary")
+
+             with gr.Column():
+                 output_text = gr.Textbox(
+                     label="Model Output",
+                     lines=15,
+                     show_copy_button=True
+                 )
+
+         generate_btn.click(
+             fn=generate_text,
+             inputs=[input_text, max_tokens],
+             outputs=output_text
+         )
+
+         gr.Markdown("### Model Info")
+         gr.Markdown(f"- Model: {MODEL_PATH}\n- Temperature: {TEMPERATURE_DEFAULT}\n- Top-p: {TOP_P_DEFAULT}")
+
+     return demo
+
  if __name__ == "__main__":
+     app = launch_app()
+     app.launch(share=True)