yukee1992 committed on
Commit
469b10d
·
verified ·
1 Parent(s): d535050

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -31
app.py CHANGED
@@ -1,42 +1,29 @@
1
  import os
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
- from accelerate import init_empty_weights, load_checkpoint_and_dispatch
5
  import gradio as gr
6
 
7
- # Configuration
8
- MODEL_ID = "google/gemma-1.1-2b-it" # Using smaller 2B version
9
  HF_TOKEN = os.getenv("HF_TOKEN")
10
- MAX_TOKENS = 100 # Conservative limit
11
- OFFLOAD_FOLDER = "./offload" # Directory for disk offloading
12
 
13
  def load_model():
14
- """Proper model loading with memory optimization"""
15
- print("🔄 Loading model with memory optimization...")
16
 
17
- # Create offload directory
18
- os.makedirs(OFFLOAD_FOLDER, exist_ok=True)
19
-
20
- # Load tokenizer
21
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
22
-
23
- # Special loading for memory constraints
24
- with init_empty_weights():
25
- model = AutoModelForCausalLM.from_pretrained(
26
- MODEL_ID,
27
- torch_dtype=torch.float32,
28
- token=HF_TOKEN
29
- )
30
-
31
- model = load_checkpoint_and_dispatch(
32
- model,
33
- checkpoint=MODEL_ID,
34
- device_map="auto",
35
- offload_folder=OFFLOAD_FOLDER,
36
- no_split_module_classes=["GemmaDecoderLayer"]
37
  )
38
 
39
- print("✅ Model loaded with disk offloading!")
 
 
40
  return tokenizer, model
41
 
42
  tokenizer, model = load_model()
@@ -44,8 +31,8 @@ tokenizer, model = load_model()
44
  def predict(topic):
45
  """Memory-safe generation"""
46
  try:
47
- prompt = f"Create a short script about {topic}:"
48
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
49
 
50
  outputs = model.generate(
51
  **inputs,
@@ -59,10 +46,10 @@ def predict(topic):
59
  except Exception as e:
60
  return f"Error: {str(e)}"
61
 
62
- # Gradio interface
63
  gr.Interface(
64
  fn=predict,
65
  inputs=gr.Textbox(label="Topic"),
66
- outputs=gr.Textbox(label="Script", lines=5),
67
  api_name="predict"
68
  ).launch(server_name="0.0.0.0")
 
1
  import os
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
4
  import gradio as gr
5
 
6
# Configuration - Using even smaller model
MODEL_ID = "google/gemma-1.1-2b-it"   # 2B instruction-tuned Gemma checkpoint
HF_TOKEN = os.getenv("HF_TOKEN")      # gated-model access token; None when unset
MAX_TOKENS = 80  # Very conservative limit
 
10
 
11
def load_model():
    """Simplified model loading that works with Spaces.

    Loads the Gemma tokenizer and causal-LM weights entirely on CPU in
    float32 (CPU kernels do not support fp16 for this model).

    Returns:
        tuple: (tokenizer, model) ready for generation.
    """
    print("🔄 Loading model (this may take a few minutes)...")

    # Load with explicit CPU mapping
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="cpu",           # Force CPU-only placement
        torch_dtype=torch.float32,  # Required for CPU
        token=HF_TOKEN,
    )

    # from_pretrained normally ties input/output embeddings already;
    # kept as a harmless safeguard for checkpoints with untied weights.
    model.tie_weights()
    # Inference-only service: switch off dropout / train-mode behavior.
    model.eval()
    print("✅ Model loaded successfully!")
    return tokenizer, model

# Load once at module import so every request reuses the same instance.
tokenizer, model = load_model()
 
31
  def predict(topic):
32
  """Memory-safe generation"""
33
  try:
34
+ prompt = f"Create a VERY short script about {topic}:\n1) Hook\n2) Point\n3) CTA\n\nScript:"
35
+ inputs = tokenizer(prompt, return_tensors="pt")
36
 
37
  outputs = model.generate(
38
  **inputs,
 
46
  except Exception as e:
47
  return f"Error: {str(e)}"
48
 
49
# Minimal interface: a single text box in, generated script out.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Topic"),
    outputs=gr.Textbox(label="Script", lines=4),
    api_name="predict",
)
# Bind to all interfaces so the Space's reverse proxy can reach the server.
demo.launch(server_name="0.0.0.0")