sourize committed on
Commit fef32cf · verified · 1 Parent(s): 11db01a

Update app.py

Files changed (1)
  1. app.py +97 -41
app.py CHANGED
@@ -1,13 +1,23 @@
  import os
  import streamlit as st
- import requests

  # ── Configuration ──────────────────────────────────────────────────────────
- HF_TOKEN = os.getenv("HF_TOKEN")  # read-only token in Space secrets
- MODEL_ID = "sourize/phi2-memory-lora"
- CONTEXT_TURNS = 6

- SYSTEM_PROMPT = (
      "You are a helpful assistant for DeepTalks with base Phi-2\n"
      "fine-tuned by Sourish for domain support.\n"
      "Answer **only** using the conversation context below.\n"
@@ -15,61 +25,107 @@ SYSTEM_PROMPT = (
      "If you don't know, say \"I don't know.\"\n"
  )

- API_URL = f"https://api-inference.huggingface.co/pipeline/text-generation/{MODEL_ID}"
- HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
-
- def query_hf(prompt: str) -> str:
-     payload = {
-         "inputs": prompt,
-         "parameters": {
-             "max_new_tokens": 128,
-             "do_sample": True,
-             "temperature": 0.7,
-             "top_p": 0.9,
-             "return_full_text": False
-         },
-         "options": {"use_cache": False}
-     }
-     r = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
-     r.raise_for_status()
-     data = r.json()
-     text = data[0]["generated_text"].strip()
-     for m in ("User:", "Assistant:"):
-         if m in text:
-             text = text.split(m)[0].strip()
-     return text or "I don't know."

  # ── Streamlit UI ──────────────────────────────────────────────────────────
  st.set_page_config(layout="centered")
- st.title("🧠 DeepTalks (Inference API)")
- st.subheader("Your personal AI Companion")

  if "history" not in st.session_state:
      st.session_state.history = []

- # Render chat history
- for role, msg in st.session_state.history:
-     st.chat_message("user" if role=="You" else "assistant").write(msg)

- user_input = st.chat_input("Type your message…")
  if user_input:
      st.chat_message("user").write(user_input)
      st.session_state.history.append(("You", user_input))

      recent = st.session_state.history[-CONTEXT_TURNS*2:]
      context = "\n".join(t for _, t in recent)
-     prompt = (
-         f"{SYSTEM_PROMPT}\n\n"
-         f"Context:\n{context}\n\n"
-         f"User: {user_input}\nAssistant:"
-     )

      with st.spinner("Thinking…"):
          try:
-             reply = query_hf(prompt)
          except Exception as e:
-             st.error(f"API error: {e}")
              reply = "I’m sorry, something went wrong."

      st.chat_message("assistant").write(reply)
      st.session_state.history.append(("Bot", reply))
 
  import os
  import streamlit as st
+ import torch
+ import logging
+ from transformers import (
+     pipeline,
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     BitsAndBytesConfig,
+ )
+ from peft import PeftModel

  # ── Configuration ──────────────────────────────────────────────────────────
+ BASE_MODEL = "microsoft/phi-2"
+ ADAPTER_REPO = "sourize/phi2-memory-lora"
+ CONTEXT_TURNS = 7
+ MAX_NEW_TOKENS = 128
+ OFFLOAD_DIR = "offload"
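+ # Directory where accelerate spills weights that don't fit in memory.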

+ SYSTEM = (
      "You are a helpful assistant for DeepTalks with base Phi-2\n"
      "fine-tuned by Sourish for domain support.\n"
      "Answer **only** using the conversation context below.\n"
      "If you don't know, say \"I don't know.\"\n"
  )

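+ # cache_resource loads the model once per process and reuses it across Streamlit reruns.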
+ @st.cache_resource(show_spinner=False)
+ def load_pipeline():
+     # 1) Tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(
+         BASE_MODEL, trust_remote_code=True, padding_side="left"
+     )
+     if tokenizer.pad_token_id is None:
+         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
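+         # Phi-2 ships without a pad token; the embedding matrix is resized below to match.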
+
+     # 2) Base model: 4-bit NF4 on CUDA, plain FP32 on CPU
+     if torch.cuda.is_available():
+         quant_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_compute_dtype=torch.float16,
+         )
+         base = AutoModelForCausalLM.from_pretrained(
+             BASE_MODEL,
+             trust_remote_code=True,
+             quantization_config=quant_config,
+             device_map="auto",
+             low_cpu_mem_usage=True,  # keep peak RAM down while loading
+             offload_folder=OFFLOAD_DIR,
+             offload_state_dict=True,
+         )
+     else:
+         base = AutoModelForCausalLM.from_pretrained(
+             BASE_MODEL,
+             trust_remote_code=True,
+             torch_dtype=torch.float32,  # no CUDA in this branch, so full precision
+             device_map="cpu",  # force CPU
+         )
+
+     # 3) Resize embeddings for the added pad token, then apply the LoRA adapter
+     base.resize_token_embeddings(len(tokenizer))
+     model = PeftModel.from_pretrained(
+         base,
+         ADAPTER_REPO,
+         device_map="auto" if torch.cuda.is_available() else None,
+     )
+     model.eval()
+
+     # 4) Build the generation pipeline; the model already carries its device placement
+     gen = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         max_new_tokens=MAX_NEW_TOKENS,
+         do_sample=True,
+         temperature=0.7,
+         top_p=0.9,
+         use_cache=True,
+         return_full_text=False,
+     )
+
+     logging.info("Pipeline loaded.")
+     return gen
+
+ generator = load_pipeline()

  # ── Streamlit UI ──────────────────────────────────────────────────────────
  st.set_page_config(layout="centered")
+ st.title("🧠 DeepTalks")
+ st.subheader("Your personal AI Companion", divider='grey')

  if "history" not in st.session_state:
      st.session_state.history = []

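+ # Replay prior turns; each history entry is a (role, text) pair.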
+ for role, text in st.session_state.history:
+     st.chat_message("user" if role == "You" else "assistant").write(text)

+ user_input = st.chat_input("Your message…")
  if user_input:
      st.chat_message("user").write(user_input)
      st.session_state.history.append(("You", user_input))

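+     # One turn adds a user and a bot entry, so slice the last CONTEXT_TURNS*2 items.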
      recent = st.session_state.history[-CONTEXT_TURNS*2:]
      context = "\n".join(t for _, t in recent)
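+     # The prompt stacks the system instructions, recent context, and the new user turn.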
+     prompt = f"""{SYSTEM}
+
+ Context:
+ {context}
+
+ User: {user_input}
+ Assistant:"""

      with st.spinner("Thinking…"):
          try:
+             reply = generator(prompt)[0]["generated_text"].strip()
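+             # Trim anything generated past the next speaker marker.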
+             for marker in ["User:", "Assistant:"]:
+                 if marker in reply:
+                     reply = reply.split(marker)[0].strip()
+             if not reply:
+                 reply = "I’m sorry, I didn’t catch that. Could you rephrase?"
          except Exception as e:
              reply = "I’m sorry, something went wrong."
+             st.error(f"Error: {e}")

      st.chat_message("assistant").write(reply)
      st.session_state.history.append(("Bot", reply))