# chatbot/app.py
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# ------------------------------------------------------------------
# CONFIG – EDIT THESE TWO LINES TO MATCH YOUR REPOS
# ------------------------------------------------------------------
BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "cjvt/GaMS-1B-Chat")
ADAPTER_ID = os.getenv("ADAPTER_ID", "janajankovic/autotrain-juhh6-uwiv9")
CSV_PATH = "chunks_for_autotrain.csv"
TOP_K = 4
MAX_INPUT_LEN = 2048
MAX_NEW_TOKENS = 256
# Enforce non-empty answers
MIN_NEW_TOKENS = 32 # prevent immediate EOS / 1-4 word outputs
MIN_CHARS = 60 # require roughly one sentence worth of text
MAX_RETRIES = 2
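# Usage sketch: both repo IDs can be overridden without editing this file,
# via the Space settings (Variables and secrets) or locally, e.g.:
#   BASE_MODEL_ID="cjvt/GaMS-1B-Chat" ADAPTER_ID="<your-adapter-repo>" python app.py
# where "<your-adapter-repo>" is a placeholder for any compatible LoRA adapter.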
# ------------------------------------------------------------------
# LOAD CSV CHUNKS + TF-IDF INDEX
# ------------------------------------------------------------------
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found: {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

# Pick the text column: prefer "chunk", then "text", else fall back to the
# first column in the file.
if "chunk" in df.columns:
    text_col = "chunk"
elif "text" in df.columns:
    text_col = "text"
else:
    text_col = df.columns[0]

chunks = df[text_col].astype(str).tolist()
if len(chunks) == 0:
    raise ValueError("No chunks loaded from CSV – check the file content.")
vectorizer = TfidfVectorizer(max_features=4096)
tfidf_matrix = vectorizer.fit_transform(chunks)
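# Debug sketch (not called by the app): inspect what the TF-IDF index returns
# for a query, to eyeball retrieval quality from a local Python shell.
def _debug_top_chunks(query: str, k: int = 3):
    q_vec = vectorizer.transform([query])
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]
    # Highest cosine similarity first; truncate chunks for readability.
    return [(round(float(sims[i]), 3), chunks[i][:80]) for i in sims.argsort()[::-1][:k]]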
# ------------------------------------------------------------------
# LOAD MODEL + TOKENIZER (BASE + LoRA ADAPTER)
# ------------------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# CRITICAL: if prompt is too long, keep the END (question + "Odgovor:")
tokenizer.truncation_side = "left"
tokenizer.padding_side = "left"
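# Left padding matters for decoder-only generation: the model continues from
# the last token, so any padding must sit at the front. With one prompt per
# call, as here, this only becomes relevant if batched generation is added.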
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
model = model.merge_and_unload()
model.to(device)
model.eval()
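# Note: merge_and_unload() folds the LoRA weights into the base model, so
# inference runs at plain base-model speed with no PEFT overhead. If loading
# runs out of RAM on a small Space, low_cpu_mem_usage=True in from_pretrained
# above is a reasonable knob to try (assuming a recent transformers version).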
# ------------------------------------------------------------------
# PROMPT + RETRIEVAL
# ------------------------------------------------------------------
# System prompt (Slovenian). English gloss: "You are an assistant for teachers
# and you answer in Slovenian. Answer briefly, clearly and without repeating
# the same phrases. If the given excerpts contain no answer, say so clearly."
SYSTEM_PROMPT = (
    "Ti si pomočnik za učitelje in odgovarjaš v slovenščini. "
    "Odgovarjaj kratko, jasno in brez ponavljanja istih fraz. "
    "Če v podanih odlomkih ni odgovora, to jasno povej."
)
def retrieve_chunks(question: str, top_k: int = TOP_K):
    """Return the top_k chunks most similar to the question (TF-IDF cosine)."""
    q_vec = vectorizer.transform([question])
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]
    top_idx = sims.argsort()[::-1][:top_k]
    return [chunks[i] for i in top_idx]
def build_prompt(question: str, retrieved):
    """Assemble the RAG prompt: system prompt, context, instruction, question."""
    context = "\n\n---\n\n".join(retrieved)
    # Slovenian labels: "Kontekst" = context, "Navodilo" = instruction,
    # "Vprašanje" = question, "Odgovor" = answer. The instruction asks for a
    # SHORT answer (3-6 sentences) based on the context, without repeating
    # the same words or sentences.
    prompt = (
        f"{SYSTEM_PROMPT}\n\n"
        f"Kontekst:\n{context}\n\n"
        "Navodilo:\n"
        "Na podlagi konteksta odgovori na vprašanje NA KRATKO (3–6 stavkov). "
        "Ne ponavljaj istih besed ali stavkov.\n"
        f"Vprašanje: {question}\n\n"
        "Odgovor:"
    )
    return prompt
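# For example, build_prompt("Kaj je X?", ["odlomek A", "odlomek B"]) yields
# (question and chunks here are hypothetical):
#
#   <SYSTEM_PROMPT>
#
#   Kontekst:
#   odlomek A
#
#   ---
#
#   odlomek B
#
#   Navodilo:
#   Na podlagi konteksta odgovori na vprašanje NA KRATKO (3–6 stavkov). ...
#   Vprašanje: Kaj je X?
#
#   Odgovor:
#
# The model is expected to continue the text after "Odgovor:" ("Answer:").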
# ------------------------------------------------------------------
# GENERATION FUNCTION FOR CHAT
# ------------------------------------------------------------------
def generate_answer(message: str, history):
    retrieved = retrieve_chunks(message, top_k=TOP_K)
    prompt = build_prompt(message, retrieved)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_LEN,
    ).to(device)
    def _generate_once(gen_kwargs: dict) -> str:
        with torch.no_grad():
            out = model.generate(**inputs, **gen_kwargs)
        # Decode only the newly generated tokens, not the echoed prompt.
        gen_ids = out[0][inputs["input_ids"].shape[1]:]
        return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
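    # Sampling notes: moderate temperature/top_p give varied but on-topic
    # text; repetition_penalty plus no_repeat_ngram_size fight the loopy
    # outputs small fine-tuned models are prone to; pad_token_id is set
    # explicitly because the tokenizer has no dedicated pad token (see above).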
    base_kwargs = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.15,
        no_repeat_ngram_size=4,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Try to enforce a minimum generation length (prevents 1-4 word answers).
    try_kwargs = dict(base_kwargs)
    try_kwargs["min_new_tokens"] = MIN_NEW_TOKENS

    raw_text = ""
    for _ in range(MAX_RETRIES + 1):
        try:
            raw_text = _generate_once(try_kwargs)
        except TypeError:
            # Older transformers versions don't support min_new_tokens; drop
            # it so later retries keep their loosened sampling settings
            # instead of hitting the same TypeError again.
            try_kwargs.pop("min_new_tokens", None)
            raw_text = _generate_once(try_kwargs)
        # Cleanup: drop a line from its third consecutive repetition onward.
        lines = [l.strip() for l in raw_text.splitlines() if l.strip()]
        cleaned = []
        last_line = None
        rep = 0
        for l in lines:
            if l == last_line:
                rep += 1
                if rep >= 2:
                    continue
            else:
                rep = 0
            last_line = l
            cleaned.append(l)

        answer = " ".join(cleaned).strip() or raw_text.strip()
        # Accept if it looks like at least one full sentence.
        if len(answer) >= MIN_CHARS and any(p in answer for p in ".!?"):
            return answer

        # Retry: loosen constraints a bit to avoid early stop / dead outputs.
        try_kwargs["temperature"] = min(0.95, try_kwargs.get("temperature", 0.7) + 0.15)
        try_kwargs["top_p"] = min(0.98, try_kwargs.get("top_p", 0.9) + 0.05)
        try_kwargs["repetition_penalty"] = max(1.05, try_kwargs.get("repetition_penalty", 1.15) - 0.05)
        try_kwargs["no_repeat_ngram_size"] = max(2, try_kwargs.get("no_repeat_ngram_size", 4) - 1)
    # Hard fallback: guarantees at least one full sentence. English gloss:
    # "The given excerpts do not contain enough information for a reliable
    # answer to this question."
    return "V podanih odlomkih ni dovolj informacij za zanesljiv odgovor na to vprašanje."
# ------------------------------------------------------------------
# GRADIO UI
# ------------------------------------------------------------------
demo = gr.ChatInterface(
    fn=generate_answer,
    # Slovenian UI strings: "GenUI – teacher's assistant" / "A chatbot
    # adapted to your material (CSV chunks)."
    title="GenUI – učiteljski pomočnik",
    description="Klepetalnik, prilagojen na tvoje gradivo (CSV chunki).",
)
if __name__ == "__main__":
    demo.launch()
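    # For local debugging, demo.launch(share=True) additionally serves a
    # temporary public URL; on a Hugging Face Space, plain launch() suffices.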