yukee1992 committed
Commit 7fbfabe · verified · 1 Parent(s): 1d59763

Update app.py

Files changed (1)
  1. app.py +12 -14
app.py CHANGED
@@ -3,26 +3,25 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import gradio as gr

-# Configuration - Using even smaller model
-MODEL_ID = "google/gemma-1.1-2b-it"
+# Configuration
+MODEL_ID = "google/gemma-1.1-2b-it" # Using smaller 2B version
 HF_TOKEN = os.getenv("HF_TOKEN")
-MAX_TOKENS = 80 # Very conservative limit
+MAX_TOKENS = 80 # Conservative limit

 def load_model():
-    """Simplified model loading that works with Spaces"""
-    print("🔄 Loading model (this may take a few minutes)...")
+    """Simplified model loading that works in Spaces"""
+    print("🔄 Loading model...")

-    # Load with explicit CPU mapping
+    # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
+
+    # Explicit CPU-only loading
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        device_map="cpu", # Force CPU-only
         torch_dtype=torch.float32, # Required for CPU
         token=HF_TOKEN
-    )
+    ).to('cpu') # Explicit CPU placement

-    # Ensure weights are tied
-    model.tie_weights()
     print("✅ Model loaded successfully!")
     return tokenizer, model

@@ -31,14 +30,13 @@ tokenizer, model = load_model()
 def predict(topic):
     """Memory-safe generation"""
     try:
-        prompt = f"Create a VERY short script about {topic}:\n1) Hook\n2) Point\n3) CTA\n\nScript:"
-        inputs = tokenizer(prompt, return_tensors="pt")
+        prompt = f"Create a short script about {topic}:\n1) Hook\n2) Point\n3) CTA\n\nScript:"
+        inputs = tokenizer(prompt, return_tensors="pt").to('cpu')

         outputs = model.generate(
             **inputs,
             max_new_tokens=MAX_TOKENS,
-            temperature=0.7,
-            do_sample=True
+            temperature=0.7
         )

         return tokenizer.decode(outputs[0], skip_special_tokens=True)
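
For reference, a minimal end-to-end sketch of how the updated file plausibly runs as a Space. The Gradio wiring and the error handling at the end of predict are cut off by this diff, so those parts are assumptions rather than code from the commit:

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

MODEL_ID = "google/gemma-1.1-2b-it"  # gated model; HF_TOKEN must have access
HF_TOKEN = os.getenv("HF_TOKEN")
MAX_TOKENS = 80

def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,  # fp32 weights for CPU inference
        token=HF_TOKEN,
    ).to("cpu")
    return tokenizer, model

tokenizer, model = load_model()

def predict(topic):
    try:
        prompt = f"Create a short script about {topic}:\n1) Hook\n2) Point\n3) CTA\n\nScript:"
        inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
        # Without do_sample=True, generate() decodes greedily and the
        # temperature value has no effect (transformers emits a warning).
        outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS, temperature=0.7)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:  # assumed handler; the diff ends before this point
        return f"Error: {e}"

# Assumed interface definition; not shown in this diff.
demo = gr.Interface(fn=predict, inputs="text", outputs="text")
demo.launch()

One practical note on the .to('cpu') change: passing device_map to from_pretrained requires the accelerate package, while calling .to('cpu') on the returned model does not, so this version avoids that extra dependency.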