sourize committed · verified
Commit df4e3a8 · 1 Parent(s): 17d9700

Update app.py

Files changed (1)
  1. app.py +46 -101
app.py CHANGED
@@ -1,131 +1,76 @@
  import os
  import streamlit as st
- import torch
- import logging
- from transformers import (
-     pipeline,
-     AutoTokenizer,
-     AutoModelForCausalLM,
-     BitsAndBytesConfig,
- )
- from peft import PeftModel

  # ── Configuration ──────────────────────────────────────────────────────────
- BASE_MODEL = "microsoft/phi-2"
- ADAPTER_REPO = "sourize/phi2-memory-lora"
- CONTEXT_TURNS = 6
- MAX_NEW_TOKENS = 128
- OFFLOAD_DIR = "offload"
-
- SYSTEM = (
-     "You are a helpful assistant for DeepTalks with base Phi-2\n"
-     "fine-tuned by Sourish for domain support.\n"
      "Answer **only** using the conversation context below.\n"
      "Do NOT output any lines beginning with 'User:' or 'Assistant:'.\n"
      "If you don't know, say \"I don't know.\"\n"
  )

- @st.cache_resource(show_spinner=False)
- def load_pipeline():
-     # 1) Tokenizer
-     tokenizer = AutoTokenizer.from_pretrained(
-         BASE_MODEL, trust_remote_code=True, padding_side="left"
-     )
-     if tokenizer.pad_token_id is None:
-         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-
-     # 2) Base model: 4-bit on CUDA, plain FP16/FP32 on CPU
-     if torch.cuda.is_available():
-         quant_config = BitsAndBytesConfig(
-             load_in_4bit=True,
-             bnb_4bit_quant_type="nf4",
-             bnb_4bit_compute_dtype="float16",
-             low_cpu_mem_usage=True,
-         )
-         base = AutoModelForCausalLM.from_pretrained(
-             BASE_MODEL,
-             trust_remote_code=True,
-             quantization_config=quant_config,
-             device_map="auto",
-             offload_folder=OFFLOAD_DIR,
-             offload_state_dict=True,
-         )
-     else:
-         dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-         base = AutoModelForCausalLM.from_pretrained(
-             BASE_MODEL,
-             trust_remote_code=True,
-             torch_dtype=dtype,
-             device_map="cpu",  # force CPU
-         )
-
-     # 3) Resize + LoRA overlay
-     base.resize_token_embeddings(len(tokenizer))
-     model = PeftModel.from_pretrained(
-         base,
-         ADAPTER_REPO,
-         trust_remote_code=True,
-         device_map="auto" if torch.cuda.is_available() else None,
-         torch_dtype=None,
-     )
-     model.eval()
-
-     # 4) Build generation pipeline
-     gen = pipeline(
-         "text-generation",
-         model=model,
-         tokenizer=tokenizer,
-         device_map="auto" if torch.cuda.is_available() else None,
-         max_new_tokens=MAX_NEW_TOKENS,
-         do_sample=True,
-         temperature=0.7,
-         top_p=0.9,
-         use_cache=True,
-         return_full_text=False,
      )
-
-     logging.info("Pipeline loaded.")
-     return gen
-
- generator = load_pipeline()

  # ── Streamlit UI ──────────────────────────────────────────────────────────
  st.set_page_config(layout="centered")
- st.title("🧠 DeepTalks")
- st.subheader("Your personal AI Companion", divider='grey')

  if "history" not in st.session_state:
-     st.session_state.history = []

  for role, text in st.session_state.history:
-     st.chat_message("user" if role == "You" else "assistant").write(text)

- user_input = st.chat_input("Your message…")
  if user_input:
      st.chat_message("user").write(user_input)
      st.session_state.history.append(("You", user_input))

      recent = st.session_state.history[-CONTEXT_TURNS*2:]
-     context = "\n".join(t for _, t in recent)
-     prompt = f"""{SYSTEM}
-
- Context:
- {context}
-
- User: {user_input}
- Assistant:"""

      with st.spinner("Thinking…"):
          try:
-             reply = generator(prompt)[0]["generated_text"].strip()
-             for marker in ["User:", "Assistant:"]:
-                 if marker in reply:
-                     reply = reply.split(marker)[0].strip()
-             if not reply:
-                 reply = "I’m sorry, I didn’t catch that. Could you rephrase?"
          except Exception as e:
-             reply = "I’m sorry, something went wrong."
-             st.error(f"Error: {e}")

      st.chat_message("assistant").write(reply)
      st.session_state.history.append(("Bot", reply))
 
  import os
  import streamlit as st
+ from huggingface_hub import InferenceClient

  # ── Configuration ──────────────────────────────────────────────────────────
+ HF_TOKEN = os.getenv("HF_TOKEN")  # store your token in Space Secrets
+ MODEL_ID = "sourize/phi2-memory-lora"
+ CONTEXT_TURNS = 7
+ MAX_NEW_TOKENS = 128
+ SYSTEM_PROMPT = (
+     "You are a helpful assistant for DeepTalks with base Phi-2 "
+     "fine-tuned by Sourish.\n"
      "Answer **only** using the conversation context below.\n"
      "Do NOT output any lines beginning with 'User:' or 'Assistant:'.\n"
      "If you don't know, say \"I don't know.\"\n"
  )

+ # ── HF Inference client ─────────────────────────────────────────────────────
+ client = InferenceClient(token=HF_TOKEN)
+
+ def query_hf(prompt: str) -> str:
+     # InferenceClient.text_generation takes generation kwargs directly;
+     # with the default details=False it returns the generated text as a str.
+     out = client.text_generation(
+         prompt,
+         model=MODEL_ID,
+         max_new_tokens=MAX_NEW_TOKENS,
+         do_sample=True,
+         temperature=0.7,
+         top_p=0.9,
+         return_full_text=False,
      )
+     text = out.strip()
+     # strip any stray markers
+     for marker in ["User:", "Assistant:"]:
+         if marker in text:
+             text = text.split(marker)[0].strip()
+     return text or "I don't know."

  # ── Streamlit UI ──────────────────────────────────────────────────────────
  st.set_page_config(layout="centered")
+ st.title("🧠 DeepTalks (Inference API)")
+ st.subheader("Your personal AI Companion")

  if "history" not in st.session_state:
+     st.session_state.history = []  # tuples of (role, text)

+ # render history
  for role, text in st.session_state.history:
+     st.chat_message("user" if role == "You" else "assistant").write(text)

+ # new input
+ user_input = st.chat_input("Type your message…")
  if user_input:
      st.chat_message("user").write(user_input)
      st.session_state.history.append(("You", user_input))

+     # build context
      recent = st.session_state.history[-CONTEXT_TURNS*2:]
+     ctx = "\n".join(text for _, text in recent)
+     prompt = (
+         f"{SYSTEM_PROMPT}\n\n"
+         f"Context:\n{ctx}\n\n"
+         f"User: {user_input}\nAssistant:"
+     )

+     # call HF Inference API
      with st.spinner("Thinking…"):
          try:
+             reply = query_hf(prompt)
          except Exception as e:
+             reply = "Error generating response."
+             st.error(e)

      st.chat_message("assistant").write(reply)
      st.session_state.history.append(("Bot", reply))
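
For a quick check outside the Space, here is a minimal smoke test of the new inference path. This is a sketch under assumptions: huggingface_hub is installed, HF_TOKEN is set in the environment, and the sourize/phi2-memory-lora repo is actually servable through the hosted Inference API (adapter-only repos are not always deployed there; if not, the call fails with an HTTP error). The two-turn history and the shortened system text are made up for illustration.

import os
from huggingface_hub import InferenceClient

MODEL_ID = "sourize/phi2-memory-lora"
client = InferenceClient(token=os.getenv("HF_TOKEN"))

# Assemble a prompt the same way app.py does: system text, the flattened
# recent history, then the pending user turn.
history = [("You", "hi"), ("Bot", "Hello! How can I help?")]
ctx = "\n".join(text for _, text in history)
prompt = (
    "You are a helpful assistant for DeepTalks.\n\n"
    f"Context:\n{ctx}\n\n"
    "User: What can you help me with?\nAssistant:"
)

# With the default details=False, text_generation returns a plain string,
# unlike the old local pipeline, which returned a list of dicts.
reply = client.text_generation(
    prompt,
    model=MODEL_ID,
    max_new_tokens=64,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    return_full_text=False,
)
print(reply.strip())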