Suguru1846 committed on
Commit
1343cce
·
verified ·
1 Parent(s): fa9e887

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -18
app.py CHANGED
@@ -1,14 +1,15 @@
1
  import os
 
 
 
 
 
2
  # Set these environment variables before importing any libraries
3
  os.environ["TRITON_DISABLE"] = "1"
4
  os.environ["BNB_DISABLE_TRITON"] = "1"
5
  os.environ["USE_TORCH"] = "1"
6
  os.environ["BITSANDBYTES_NOWELCOME"] = "1"
7
 
8
- from fastapi import FastAPI
9
- from transformers import AutoModelForCausalLM, AutoTokenizer
10
- import torch
11
-
12
  # Set writable cache locations
13
  os.environ["HF_HOME"] = "/app/.cache/huggingface"
14
  os.environ["TRANSFORMERS_CACHE"] = "/app/.cache/huggingface"
@@ -17,29 +18,68 @@ os.environ["TORCH_HOME"] = "/app/.cache/torch"
17
  # FastAPI app instance
18
  app = FastAPI()
19
 
20
- # Use a different model entirely - smaller and more compatible
21
- model_name = "facebook/opt-350m" # Much smaller model that should work reliably
22
-
23
- # Load tokenizer
24
  tokenizer = AutoTokenizer.from_pretrained(
25
- model_name,
26
  cache_dir="/app/.cache/huggingface"
27
  )
28
 
29
- # Load model
30
- model = AutoModelForCausalLM.from_pretrained(
31
- model_name,
32
- torch_dtype=torch.float16,
33
- device_map="auto",
34
- cache_dir="/app/.cache/huggingface"
35
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  @app.post("/generate")
38
  async def generate_text(prompt: str, max_tokens: int = 50):
39
  """Generates text using the model."""
40
  try:
41
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
42
- outputs = model.generate(**inputs, max_new_tokens=max_tokens)
 
 
 
 
 
 
43
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
44
  return {"response": response}
45
  except Exception as e:
@@ -48,4 +88,5 @@ async def generate_text(prompt: str, max_tokens: int = 50):
48
 
49
  @app.get("/")
50
  async def root():
51
- return {"message": "AI Model is Running!"}
 
 
1
  import os
2
+ import torch
3
+ from fastapi import FastAPI
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+ from huggingface_hub import hf_hub_download
6
+
7
  # Set these environment variables before importing any libraries
8
  os.environ["TRITON_DISABLE"] = "1"
9
  os.environ["BNB_DISABLE_TRITON"] = "1"
10
  os.environ["USE_TORCH"] = "1"
11
  os.environ["BITSANDBYTES_NOWELCOME"] = "1"
12
 
 
 
 
 
13
  # Set writable cache locations
14
  os.environ["HF_HOME"] = "/app/.cache/huggingface"
15
  os.environ["TRANSFORMERS_CACHE"] = "/app/.cache/huggingface"
 
18
  # FastAPI app instance
19
  app = FastAPI()
20
 
21
+ # Load the base model and tokenizer
22
+ base_model_name = "unsloth/Llama-3.2-3B-Instruct"
 
 
23
  tokenizer = AutoTokenizer.from_pretrained(
24
+ base_model_name,
25
  cache_dir="/app/.cache/huggingface"
26
  )
27
 
28
+ try:
29
+ # Load the base model first
30
+ model = AutoModelForCausalLM.from_pretrained(
31
+ base_model_name,
32
+ torch_dtype=torch.float16,
33
+ device_map="auto",
34
+ cache_dir="/app/.cache/huggingface"
35
+ )
36
+
37
+ # Now try loading and merging your adapter
38
+ try:
39
+ # Import PEFT after model loading to avoid conflicts
40
+ from peft import PeftModel, PeftConfig
41
+
42
+ adapter_name = "Suguru1846/lora_model_counseling_4bit"
43
+
44
+ # First try the standard approach
45
+ model = PeftModel.from_pretrained(
46
+ model,
47
+ adapter_name,
48
+ device_map="auto"
49
+ )
50
+ print("Successfully loaded adapter with standard method")
51
+ except Exception as adapter_error:
52
+ print(f"Standard adapter loading failed: {str(adapter_error)}")
53
+ print("Model is still running with base model only")
54
+
55
+ except Exception as model_error:
56
+ print(f"Error loading base model: {str(model_error)}")
57
+ # Fallback to a working model
58
+ model_name = "facebook/opt-350m"
59
+ model = AutoModelForCausalLM.from_pretrained(
60
+ model_name,
61
+ torch_dtype=torch.float16,
62
+ device_map="auto",
63
+ cache_dir="/app/.cache/huggingface"
64
+ )
65
+ tokenizer = AutoTokenizer.from_pretrained(
66
+ model_name,
67
+ cache_dir="/app/.cache/huggingface"
68
+ )
69
+ print("Fell back to OPT-350M model")
70
 
71
  @app.post("/generate")
72
  async def generate_text(prompt: str, max_tokens: int = 50):
73
  """Generates text using the model."""
74
  try:
75
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
76
+ outputs = model.generate(
77
+ **inputs,
78
+ max_new_tokens=max_tokens,
79
+ do_sample=True,
80
+ temperature=0.7,
81
+ top_p=0.9
82
+ )
83
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
84
  return {"response": response}
85
  except Exception as e:
 
88
 
89
  @app.get("/")
90
  async def root():
91
+ model_type = "Base Llama-3.2 with adapter" if hasattr(model, "peft_config") else "Base Llama-3.2 only"
92
+ return {"message": f"AI Model is Running! Using: {model_type}"}