Scaryscar commited on
Commit
9a395d1
·
verified ·
1 Parent(s): d2b31d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -50
app.py CHANGED
@@ -1,62 +1,46 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
2
- import torch
3
  import gradio as gr
4
- import os
5
-
6
- # 1. GPU Verification
7
- if not torch.cuda.is_available():
8
- raise RuntimeError("❌ GPU not detected! Enable GPU in Space settings.")
9
- print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
10
-
11
- # 2. Model Configuration
12
- MODEL_NAME = "google/gemma-2b-it" # Try "mistralai/Mistral-7B-v0.1" for more power
13
 
14
- # 3. Load Model with GPU Optimization
15
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
16
- model = AutoModelForCausalLM.from_pretrained(
17
- MODEL_NAME,
18
- device_map="auto", # Auto-selects GPU
19
- torch_dtype=torch.float16, # Half-precision for memory
20
- low_cpu_mem_usage=True # Reduces CPU overhead
21
- )
22
 
23
- # 4. Create GPU-accelerated pipeline
24
- pipe = pipeline(
25
  "text-generation",
26
- model=model,
27
- tokenizer=tokenizer,
28
- device=0, # Force first GPU
29
- torch_dtype=torch.float16
 
 
 
30
  )
31
 
32
- # 5. Generation Function
33
- def generate_text(prompt):
 
 
 
34
  try:
35
- outputs = pipe(
36
  prompt,
37
- max_new_tokens=150,
38
- temperature=0.7,
39
- do_sample=True,
40
- pad_token_id=tokenizer.eos_token_id
41
  )
42
- return outputs[0]['generated_text']
43
  except Exception as e:
44
- return f"⚠️ Error: {str(e)}"
45
 
46
- # 6. Gradio Interface
47
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
48
- gr.Markdown("## 🚀 GPU-Powered Text Generator")
49
- with gr.Row():
50
- inp = gr.Textbox(label="Your Prompt", placeholder="Type here...")
51
- with gr.Row():
52
- out = gr.Textbox(label="Generated Text", lines=5)
53
- with gr.Row():
54
- btn = gr.Button("Generate", variant="primary")
55
-
56
- btn.click(fn=generate_text, inputs=inp, outputs=out)
57
- inp.submit(fn=generate_text, inputs=inp, outputs=out)
58
 
59
- # 7. Launch with GPU monitoring
60
- if __name__ == "__main__":
61
- print(f"GPU Memory Allocated: {torch.cuda.memory_allocated()/1e9:.2f} GB")
62
- demo.launch(server_name="0.0.0.0")
 
1
+ from transformers import pipeline
 
2
  import gradio as gr
3
+ import torch
 
 
 
 
 
 
 
 
4
 
5
+ # Auto-configure GPU/CPU
6
+ device = 0 if torch.cuda.is_available() else -1
7
+ dtype = torch.float16 if device == 0 else torch.float32
8
+ print(f"⚡ Using {'GPU: ' + torch.cuda.get_device_name(0) if device == 0 else 'CPU'}")
 
 
 
 
9
 
10
+ # Load optimized pipeline
11
+ model = pipeline(
12
  "text-generation",
13
+ model="google/gemma-2b-it",
14
+ device=device,
15
+ torch_dtype=dtype,
16
+ model_kwargs={
17
+ "low_cpu_mem_usage": True,
18
+ "trust_remote_code": True
19
+ }
20
  )
21
 
22
+ # Pre-warm model (reduces first response time)
23
+ model("Warming up...", max_new_tokens=1)
24
+
25
+ def generate(prompt):
26
+ """Ultra-fast generation with 1-2 second responses"""
27
  try:
28
+ output = model(
29
  prompt,
30
+ max_new_tokens=80, # Shorter = faster
31
+ temperature=0.3, # More deterministic
32
+ do_sample=False, # Disable sampling for speed
33
+ pad_token_id=model.tokenizer.eos_token_id
34
  )
35
+ return output[0]['generated_text']
36
  except Exception as e:
37
+ return f"Error: {str(e)}"
38
 
39
+ # Minimal UI for maximum speed
40
+ with gr.Blocks(title="🚀 Instant AI") as demo:
41
+ gr.Markdown("## Type anything (1-2 sec responses):")
42
+ input = gr.Textbox(placeholder="How to make pizza?")
43
+ output = gr.Textbox()
44
+ input.submit(generate, input, output)
 
 
 
 
 
 
45
 
46
+ demo.launch(server_name="0.0.0.0")