Spaces:

janajankovic
/

chatbot

Sleeping

App Files Files Community

janajankovic committed 27 days ago

Commit

8c4bae4

verified ·

1 Parent(s): 923efae

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -135

app.py CHANGED Viewed

@@ -1,175 +1,175 @@
 import os
 import gradio as gr
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from peft import PeftModel
-# -------------------------------------------------------------------
-# CONFIG
-# -------------------------------------------------------------------
-# Your fine-tuned adapter repo on HF
-MODEL_ID = "janajankovic/autotrain-juhh6-uwiv9"  # change if needed
-# Base model that was fine-tuned (the one you used in AutoTrain)
-BASE_MODEL_ID = "cjvt/GaMS-1B-Chat"  # change if different
-# CSV with chunks (already in the Space repo)
 CSV_PATH = "chunks_for_autotrain.csv"
-# Number of additional chunks retrieved after the single best match
-N_NEIGHBORS = 4
 MAX_NEW_TOKENS = 256
-TEMPERATURE = 0.7  # sampling temperature passed to the text-generation pipeline
-TOP_P = 0.9  # top_p sampling value passed to the text-generation pipeline
-# -------------------------------------------------------------------
-# LOAD MODEL (BASE + PEFT ADAPTER)
-# -------------------------------------------------------------------
-print("Loading base model and tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)  # tokenizer comes from the base model repo
-base_model = AutoModelForCausalLM.from_pretrained(
-    BASE_MODEL_ID,
-    torch_dtype="auto",
-)
-# Wrap the base model with the fine-tuned PEFT adapter stored in MODEL_ID
-print("Loading PEFT adapter...")
-model = PeftModel.from_pretrained(base_model, MODEL_ID)
-# Fall back to the EOS token id for padding when the config defines no pad token id
-if model.config.pad_token_id is None and model.config.eos_token_id is not None:
-    model.config.pad_token_id = model.config.eos_token_id
-generator = pipeline(  # text-generation pipeline called by generate_answer
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-)
-# -------------------------------------------------------------------
-# LOAD CHUNKS + BUILD TF-IDF RETRIEVER
-# -------------------------------------------------------------------
-print("Loading CSV chunks...")
 df = pd.read_csv(CSV_PATH)
-df["text"] = df["text"].fillna("")  # replace missing cells with empty strings before vectorizing
-documents = df["text"].tolist()  # chunk texts, index-aligned with the rows of doc_matrix
-print("Building TF-IDF index...")
-vectorizer = TfidfVectorizer(max_features=50000)
-doc_matrix = vectorizer.fit_transform(documents)  # one TF-IDF row per chunk
-# -------------------------------------------------------------------
-# RETRIEVAL: TOP-1 + NEXT N_NEIGHBORS MOST SIMILAR CHUNKS
-# -------------------------------------------------------------------
-def retrieve_chunks(query: str, n_neighbors: int = N_NEIGHBORS):  # returns up to n_neighbors + 1 chunk texts, best match first
-    query = query.strip()
-    if not query:
-        return []  # blank query: nothing to retrieve
-    # Score the query against every chunk in the TF-IDF index
-    q_vec = vectorizer.transform([query])
-    sims = cosine_similarity(q_vec, doc_matrix).flatten()
-    if sims.max() <= 0:
-        return []  # no chunk has a positive similarity to the query
-    # Chunk indices ordered from most to least similar
-    sorted_indices = sims.argsort()[::-1]
-    # The best match always goes first in the result
-    central_idx = int(sorted_indices[0])
-    # Append the next most similar chunks until n_neighbors + 1 are selected
-    neighbor_indices = [central_idx]
-    for idx in sorted_indices[1:]:
-        if len(neighbor_indices) >= n_neighbors + 1:
-            break
-        neighbor_indices.append(int(idx))
-    # Map indices back to chunk text, preserving the similarity ranking
-    selected_texts = [documents[i] for i in neighbor_indices]
-    return selected_texts
-def build_context(question: str) -> str:  # joins the retrieved chunks into one labelled context string
-    chunks = retrieve_chunks(question, N_NEIGHBORS)
-    if not chunks:
-        return ""  # empty string tells the caller that no context was retrieved
-    # Prefix each chunk with a 1-based "[CHUNK n]" label so chunks stay distinguishable
-    labelled = []
-    for i, ch in enumerate(chunks):
-        labelled.append(f"[CHUNK {i+1}]\n{ch}")
-    return "\n\n".join(labelled)
-# -------------------------------------------------------------------
-# CHAT FUNCTION
-# -------------------------------------------------------------------
 SYSTEM_PROMPT = (
-    "Ti si pomočnik, ki odgovarja v slovenščini.\n"  # "You are an assistant that answers in Slovene."
-    "Uporabi spodnji kontekst, če je relevanten. "  # "Use the context below if it is relevant."
-    "Če kontekst ne vsebuje odgovora, odgovori po svojih najboljših močeh "  # "If the context lacks the answer, answer as best you can"
-    "in jasno povej, da se opiraš na splošno znanje.\n"  # "and state clearly that you rely on general knowledge."
 )
-def generate_answer(message: str) -> str:  # builds the prompt (with context when available) and returns the generated answer text
-    context = build_context(message)
-    if context:
-        full_prompt = (
-            f"{SYSTEM_PROMPT}\n"
-            f"Kontekst:\n{context}\n\n"
-            f"Vprašanje uporabnika:\n{message}\n\n"
-            f"Odgovor (v slovenščini):\n"
-        )
-    else:  # no chunks retrieved: same prompt without the "Kontekst" section
-        full_prompt = (
-            f"{SYSTEM_PROMPT}\n"
-            f"Vprašanje uporabnika:\n{message}\n\n"
-            f"Odgovor (v slovenščini):\n"
-        )
-    outputs = generator(
-        full_prompt,
-        max_new_tokens=MAX_NEW_TOKENS,
-        do_sample=True,
-        temperature=TEMPERATURE,
-        top_p=TOP_P,
-        pad_token_id=model.config.pad_token_id,
-    )
-    generated = outputs[0]["generated_text"]
-    # Drop the first len(full_prompt) characters; assumes generated_text begins with the prompt verbatim
-    answer = generated[len(full_prompt):].strip()
-    return answer
-def chat_fn(message, history):  # callback given to gr.ChatInterface; history is accepted but not used
-    return generate_answer(message)
-# -------------------------------------------------------------------
 # GRADIO UI
-# -------------------------------------------------------------------
 demo = gr.ChatInterface(
-    fn=chat_fn,  # called with (message, history) for each user turn
-    title="Gen-UI fine-tuned Slovene model",
-    description=(
-        "Klepet z lastnim fine-tunanim modelom.\n"  # "Chat with your own fine-tuned model."
-        "Model samodejno poišče najbližje besedilne 'chunke' v CSV in jih uporabi kot kontekst."  # "The model automatically finds the closest text chunks in the CSV and uses them as context."
-    ),
 )
 if __name__ == "__main__":
     demo.launch()

 import os
 import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+# ------------------------------------------------------------------
+# CONFIG – EDIT THESE TWO LINES TO MATCH YOUR REPOS
+# ------------------------------------------------------------------
+BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "cjvt/GaMS-1B-Chat")  # base model repo; overridable via the BASE_MODEL_ID environment variable
+# Fine-tuned adapter repo; overridable via the ADAPTER_ID environment variable
+ADAPTER_ID = os.getenv("ADAPTER_ID", "janajankovic/autotrain-juhh6-uwiv9")
 CSV_PATH = "chunks_for_autotrain.csv"
+TOP_K = 4  # how many most similar chunks to use as context
+MAX_INPUT_LEN = 2048  # token limit applied to the prompt when it is tokenized in generate_answer
 MAX_NEW_TOKENS = 256
+# ------------------------------------------------------------------
+# LOAD CSV CHUNKS + TF-IDF INDEX
+# ------------------------------------------------------------------
+if not os.path.exists(CSV_PATH):
+    raise FileNotFoundError(f"CSV file not found: {CSV_PATH}")
 df = pd.read_csv(CSV_PATH)
+# Try to guess which column holds the text
+if "chunk" in df.columns:
+    text_col = "chunk"
+elif "text" in df.columns:
+    text_col = "text"
+else:
+    # fallback: first column
+    text_col = df.columns[0]
+chunks = df[text_col].astype(str).tolist()
+if len(chunks) == 0:
+    raise ValueError("No chunks loaded from CSV – check the file content.")
+vectorizer = TfidfVectorizer(max_features=4096)
+tfidf_matrix = vectorizer.fit_transform(chunks)
+# ------------------------------------------------------------------
+# LOAD MODEL + TOKENIZER (BASE + LoRA ADAPTER)
+# ------------------------------------------------------------------
+device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is available
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token for padding when none is defined
+base_model = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL_ID,
+    torch_dtype=torch.float16 if device == "cuda" else torch.float32,  # half precision on GPU, full precision on CPU
+)
+model = PeftModel.from_pretrained(base_model, ADAPTER_ID)  # attach the fine-tuned adapter to the base model
+# Merge LoRA into the base model so we can use it like a normal CausalLM
+model = model.merge_and_unload()
+model.to(device)
+model.eval()  # evaluation mode; the model is only used for generation here
+# ------------------------------------------------------------------
+# PROMPT + RETRIEVAL
+# ------------------------------------------------------------------
 SYSTEM_PROMPT = (
+    "Ti si pomočnik za učitelje in odgovarjaš v slovenščini. "  # "You are an assistant for teachers and you answer in Slovene."
+    "Odgovarjaj kratko, jasno in brez ponavljanja istih fraz. "  # "Answer briefly, clearly and without repeating the same phrases."
+    "Če v podanih odlomkih ni odgovora, to jasno povej."  # "If the given passages contain no answer, say so clearly."
 )
+def retrieve_chunks(question: str, top_k: int = TOP_K):
+    """Return top_k most similar chunks for the given question."""
+    q_vec = vectorizer.transform([question])
+    sims = cosine_similarity(q_vec, tfidf_matrix)[0]
+    top_idx = sims.argsort()[::-1][:top_k]
+    return [chunks[i] for i in top_idx]
+def build_prompt(question: str, retrieved):
+    context = "\n\n---\n\n".join(retrieved)
+    prompt = (
+        f"{SYSTEM_PROMPT}\n\n"
+        f"Kontekst:\n{context}\n\n"
+        "Navodilo:\n"
+        "Na podlagi konteksta odgovori na vprašanje NA KRATKO (3–6 stavkov). "
+        "Ne ponavljaj istih besed ali stavkov.\n"
+        f"Vprašanje: {question}\n\n"
+        "Odgovor:"
+    )
+    return prompt
+# ------------------------------------------------------------------
+# GENERATION FUNCTION FOR CHAT
+# ------------------------------------------------------------------
+def generate_answer(message: str, history):
+    # 1) retrieve relevant chunks
+    retrieved = retrieve_chunks(message, top_k=TOP_K)
+    # 2) build prompt
+    prompt = build_prompt(message, retrieved)
+    # 3) tokenize
+    inputs = tokenizer(
+        prompt,
+        return_tensors="pt",
+        truncation=True,
+        max_length=MAX_INPUT_LEN,
+    ).to(device)
+    # 4) generate with stronger anti-repetition settings
+    with torch.no_grad():
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=MAX_NEW_TOKENS,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            repetition_penalty=1.15,
+            no_repeat_ngram_size=4,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    # 5) strip the prompt part, decode only new tokens
+    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
+    raw_text = tokenizer.decode(
+        generated_ids,
+        skip_special_tokens=True,
+    ).strip()
+    # 6) small cleanup: remove very long runs of the same line
+    # (simple heuristic to kill the insane repetition cases)
+    lines = [l.strip() for l in raw_text.splitlines() if l.strip()]
+    cleaned = []
+    last_line = None
+    repeat_count = 0
+    for l in lines:
+        if l == last_line:
+            repeat_count += 1
+            if repeat_count >= 2:
+                # skip extra repetitions
+                continue
+        else:
+            repeat_count = 0
+            last_line = l
+        cleaned.append(l)
+    answer = " ".join(cleaned).strip()
+    return answer or raw_text
+# ------------------------------------------------------------------
 # GRADIO UI
+# ------------------------------------------------------------------
 demo = gr.ChatInterface(
+    fn=generate_answer,  # called with (message, history) for each user turn
+    title="GenUI – učiteljski pomočnik",  # "GenUI – teacher's assistant"
+    description="Klepetalnik, prilagojen na tvoje gradivo (CSV chunki).",  # "Chatbot adapted to your material (CSV chunks)."
 )
 if __name__ == "__main__":
     demo.launch()