bsny committed
Commit 7d876e9 · verified · 1 Parent(s): a517c6a

Update app.py

Files changed (1):
  1. app.py +13 -12
app.py CHANGED
@@ -1,28 +1,31 @@
-from fastapi import FastAPI, Request
+from fastapi import FastAPI
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-import os
 import uuid
+import os
 
+# FastAPI app setup
 app = FastAPI()
-import os; os.environ["HF_HOME"] = "/tmp/huggingface"
 
-model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
+# Use HF cache location that's safe in HF Spaces
+os.environ["HF_HOME"] = "/data/huggingface"
+
+# Use a CPU-compatible model (non-GPTQ)
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 hf_token = os.environ.get("HF_TOKEN")
 
+# Load model and tokenizer (no GPU-specific args)
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    torch_dtype=torch.float16,
-    device_map="auto",
-    low_cpu_mem_usage=True,
     token=hf_token
-)
+).to("cpu")
 
-# Store per-session system prompts
+# In-memory store for system prompts per session
 session_prompts = {}
 
+# Request body models
 class SystemPrompt(BaseModel):
     prompt: str
 
@@ -44,14 +47,12 @@ def chat(message: UserMessage):
 
     full_prompt = f"<|system|>\n{system}\n<|user|>\n{message.message}\n<|assistant|>\n"
 
-    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
+    inputs = tokenizer(full_prompt, return_tensors="pt").to("cpu")
     outputs = model.generate(
         **inputs,
         max_new_tokens=200,
         pad_token_id=tokenizer.eos_token_id,
    )
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    # Strip input part to isolate model's answer
     answer = response.replace(full_prompt.strip(), "").strip()
     return {"response": answer}
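The commit swaps the GPTQ INT4 checkpoint for the plain Meta-Llama-3-8B-Instruct model and pins loading and inference to CPU, since GPTQ quantized inference normally depends on GPU kernels that a CPU-only Space does not have. For context, below is a minimal client sketch of how the updated Space might be exercised once running. The route paths ("/system-prompt", "/chat"), the session_id field, and the base URL are assumptions for illustration only; the diff above confirms just the request models (SystemPrompt.prompt, UserMessage.message) and the {"response": ...} return shape.

import requests

BASE_URL = "https://<owner>-<space>.hf.space"  # placeholder, not part of the commit

# Assumed endpoint for registering a per-session system prompt; only the
# SystemPrompt model (a single "prompt" field) appears in the diff above.
resp = requests.post(f"{BASE_URL}/system-prompt",
                     json={"prompt": "You are a concise assistant."})
session_id = resp.json().get("session_id")  # hypothetical response field

# Assumed chat endpoint; the handler takes a UserMessage with a "message"
# field and returns {"response": ...}, as shown in the diff.
resp = requests.post(f"{BASE_URL}/chat",
                     json={"message": "Hello!", "session_id": session_id})
print(resp.json()["response"])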