janajankovic committed
Commit 8c4bae4 · verified · 1 Parent(s): 923efae

Update app.py

Files changed (1):
  app.py (+135 -135)
app.py CHANGED
@@ -1,175 +1,175 @@
import os

import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- from peft import PeftModel
-
- # -------------------------------------------------------------------
- # CONFIG
- # -------------------------------------------------------------------
-
- # Your fine-tuned adapter repo on HF
- MODEL_ID = "janajankovic/autotrain-juhh6-uwiv9"  # change if needed

- # Base model that was fine-tuned (the one you used in AutoTrain)
- BASE_MODEL_ID = "cjvt/GaMS-1B-Chat"  # change if different

- # CSV with chunks (already in the Space repo)
CSV_PATH = "chunks_for_autotrain.csv"
-
- # How many *extra* chunks (besides the top-1) to add
- N_NEIGHBORS = 4
-
MAX_NEW_TOKENS = 256
- TEMPERATURE = 0.7
- TOP_P = 0.9
-
- # -------------------------------------------------------------------
- # LOAD MODEL (BASE + PEFT ADAPTER)
- # -------------------------------------------------------------------
-
- print("Loading base model and tokenizer...")
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

- base_model = AutoModelForCausalLM.from_pretrained(
-     BASE_MODEL_ID,
-     torch_dtype="auto",
- )
-
- # Attach LoRA / PEFT adapter
- print("Loading PEFT adapter...")
- model = PeftModel.from_pretrained(base_model, MODEL_ID)
-
- # Make sure pad token is set
- if model.config.pad_token_id is None and model.config.eos_token_id is not None:
-     model.config.pad_token_id = model.config.eos_token_id
-
- generator = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
- )

- # -------------------------------------------------------------------
- # LOAD CHUNKS + BUILD TF-IDF RETRIEVER
- # -------------------------------------------------------------------

- print("Loading CSV chunks...")
df = pd.read_csv(CSV_PATH)
- df["text"] = df["text"].fillna("")

- documents = df["text"].tolist()

- print("Building TF-IDF index...")
- vectorizer = TfidfVectorizer(max_features=50000)
- doc_matrix = vectorizer.fit_transform(documents)

- # -------------------------------------------------------------------
- # RETRIEVAL: TOP-1 + NEXT N_NEIGHBORS MOST SIMILAR CHUNKS
- # -------------------------------------------------------------------

- def retrieve_chunks(query: str, n_neighbors: int = N_NEIGHBORS):
-     query = query.strip()
-     if not query:
-         return []

-     # similarity of question vs all chunks
-     q_vec = vectorizer.transform([query])
-     sims = cosine_similarity(q_vec, doc_matrix).flatten()

-     if sims.max() <= 0:
-         return []

-     # indices sorted by similarity to the question (desc)
-     sorted_indices = sims.argsort()[::-1]

-     # central: most similar to question
-     central_idx = int(sorted_indices[0])
-
-     # neighbors: next n_neighbors most similar to question
-     neighbor_indices = [central_idx]
-     for idx in sorted_indices[1:]:
-         if len(neighbor_indices) >= n_neighbors + 1:
-             break
-         neighbor_indices.append(int(idx))
-
-     # keep order: central first, then neighbors
-     selected_texts = [documents[i] for i in neighbor_indices]
-     return selected_texts
-
- def build_context(question: str) -> str:
-     chunks = retrieve_chunks(question, N_NEIGHBORS)
-     if not chunks:
-         return ""

-     # Optional: prefix chunks for clarity (not strictly needed)
-     labelled = []
-     for i, ch in enumerate(chunks):
-         labelled.append(f"[CHUNK {i+1}]\n{ch}")
-     return "\n\n".join(labelled)

- # -------------------------------------------------------------------
- # CHAT FUNCTION
- # -------------------------------------------------------------------

SYSTEM_PROMPT = (
-     "Ti si pomočnik, ki odgovarja v slovenščini.\n"
-     "Uporabi spodnji kontekst, če je relevanten. "
-     "Če kontekst ne vsebuje odgovora, odgovori po svojih najboljših močeh "
-     "in jasno povej, da se opiraš na splošno znanje.\n"
)

- def generate_answer(message: str) -> str:
-     context = build_context(message)
-
-     if context:
-         full_prompt = (
-             f"{SYSTEM_PROMPT}\n"
-             f"Kontekst:\n{context}\n\n"
-             f"Vprašanje uporabnika:\n{message}\n\n"
-             f"Odgovor (v slovenščini):\n"
-         )
-     else:
-         full_prompt = (
-             f"{SYSTEM_PROMPT}\n"
-             f"Vprašanje uporabnika:\n{message}\n\n"
-             f"Odgovor (v slovenščini):\n"
-         )

-     outputs = generator(
-         full_prompt,
-         max_new_tokens=MAX_NEW_TOKENS,
-         do_sample=True,
-         temperature=TEMPERATURE,
-         top_p=TOP_P,
-         pad_token_id=model.config.pad_token_id,
-     )

-     generated = outputs[0]["generated_text"]

-     # strip the prompt from the beginning
-     answer = generated[len(full_prompt):].strip()
-     return answer

- def chat_fn(message, history):
-     return generate_answer(message)

- # -------------------------------------------------------------------
# GRADIO UI
- # -------------------------------------------------------------------
-
demo = gr.ChatInterface(
-     fn=chat_fn,
-     title="Gen-UI fine-tuned Slovene model",
-     description=(
-         "Klepet z lastnim fine-tunanim modelom.\n"
-         "Model samodejno poišče najbližje besedilne 'chunke' v CSV in jih uporabi kot kontekst."
-     ),
)

if __name__ == "__main__":
    demo.launch()
 
import os

import gradio as gr
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from peft import PeftModel
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

+ # ------------------------------------------------------------------
+ # CONFIG: EDIT THESE TWO LINES TO MATCH YOUR REPOS
+ # ------------------------------------------------------------------
+ BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "cjvt/GaMS-1B-Chat")
+ # Replace this with the name of YOUR fine-tuned adapter repo
+ ADAPTER_ID = os.getenv("ADAPTER_ID", "janajankovic/autotrain-juhh6-uwiv9")
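+ # Both IDs read from the environment first, so they can be overridden
+ # without editing the file, e.g. (hypothetical values):
+ #   ADAPTER_ID=you/your-adapter BASE_MODEL_ID=cjvt/GaMS-1B-Chat python app.py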

CSV_PATH = "chunks_for_autotrain.csv"
+ TOP_K = 4  # how many most similar chunks to use as context
+ MAX_INPUT_LEN = 2048  # truncate prompts longer than this many tokens

MAX_NEW_TOKENS = 256

+ # ------------------------------------------------------------------
+ # LOAD CSV CHUNKS + TF-IDF INDEX
+ # ------------------------------------------------------------------
+ if not os.path.exists(CSV_PATH):
+     raise FileNotFoundError(f"CSV file not found: {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

+ # Try to guess which column holds the text
+ if "chunk" in df.columns:
+     text_col = "chunk"
+ elif "text" in df.columns:
+     text_col = "text"
+ else:
+     # fallback: first column
+     text_col = df.columns[0]

+ chunks = df[text_col].astype(str).tolist()

+ if len(chunks) == 0:
+     raise ValueError("No chunks loaded from CSV; check the file content.")

+ vectorizer = TfidfVectorizer(max_features=4096)
+ tfidf_matrix = vectorizer.fit_transform(chunks)
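+ # The TF-IDF matrix doubles as the retriever: each question is vectorized
+ # with the same fitted vocabulary and scored against every chunk by cosine
+ # similarity (see retrieve_chunks below).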
 
 

+ # ------------------------------------------------------------------
+ # LOAD MODEL + TOKENIZER (BASE + LoRA ADAPTER)
+ # ------------------------------------------------------------------
+ device = "cuda" if torch.cuda.is_available() else "cpu"

+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
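+     # (GPT-style checkpoints often ship without a pad token; reusing EOS
+     # for padding is the usual fallback for causal LMs.)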

+ base_model = AutoModelForCausalLM.from_pretrained(
+     BASE_MODEL_ID,
+     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+ )

+ model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
+ # Merge LoRA into the base model so we can use it like a normal CausalLM
+ model = model.merge_and_unload()
+ model.to(device)
+ model.eval()
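+ # After merge_and_unload() the LoRA weights are folded into the base
+ # weights, so generation runs without the PEFT wrapper overhead.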

+ # ------------------------------------------------------------------
+ # PROMPT + RETRIEVAL
+ # ------------------------------------------------------------------
+ # In English, the system prompt says: "You are an assistant for teachers
+ # and you answer in Slovene. Answer briefly, clearly and without repeating
+ # the same phrases. If the given excerpts do not contain the answer, say
+ # so clearly."
SYSTEM_PROMPT = (
+     "Ti si pomočnik za učitelje in odgovarjaš v slovenščini. "
+     "Odgovarjaj kratko, jasno in brez ponavljanja istih fraz. "
+     "Če v podanih odlomkih ni odgovora, to jasno povej."
)

+ def retrieve_chunks(question: str, top_k: int = TOP_K):
+     """Return the top_k chunks most similar to the given question."""
+     q_vec = vectorizer.transform([question])
+     sims = cosine_similarity(q_vec, tfidf_matrix)[0]
+     top_idx = sims.argsort()[::-1][:top_k]
+     return [chunks[i] for i in top_idx]
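+ # Usage sketch: retrieve_chunks("Kako ocenim esej?") returns the TOP_K
+ # chunk strings ordered from most to least similar (the Slovene example
+ # question, "How do I grade an essay?", is made up for illustration).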
 
 

+ def build_prompt(question: str, retrieved):
+     context = "\n\n---\n\n".join(retrieved)
+
+     # The instruction block below says, in English: "Based on the context,
+     # answer the question BRIEFLY (3-6 sentences). Do not repeat the same
+     # words or sentences."
+     prompt = (
+         f"{SYSTEM_PROMPT}\n\n"
+         f"Kontekst:\n{context}\n\n"
+         "Navodilo:\n"
+         "Na podlagi konteksta odgovori na vprašanje NA KRATKO (3–6 stavkov). "
+         "Ne ponavljaj istih besed ali stavkov.\n"
+         f"Vprašanje: {question}\n\n"
+         "Odgovor:"
+     )
+     return prompt
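+ # The assembled prompt is: system prompt, "Kontekst:" (context) with the
+ # retrieved chunks, "Navodilo:" (instruction), "Vprašanje:" (question),
+ # and a trailing "Odgovor:" (answer) cue for the model to complete.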
+
+
+ # ------------------------------------------------------------------
+ # GENERATION FUNCTION FOR CHAT
+ # ------------------------------------------------------------------
+ def generate_answer(message: str, history):
+     # 1) retrieve relevant chunks
+     retrieved = retrieve_chunks(message, top_k=TOP_K)
+
+     # 2) build prompt
+     prompt = build_prompt(message, retrieved)
+
+     # 3) tokenize
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         truncation=True,
+         max_length=MAX_INPUT_LEN,
+     ).to(device)
+
+     # 4) generate with stronger anti-repetition settings
+     with torch.no_grad():
+         output_ids = model.generate(
+             **inputs,
+             max_new_tokens=MAX_NEW_TOKENS,
+             do_sample=True,
+             temperature=0.7,
+             top_p=0.9,
+             repetition_penalty=1.15,
+             no_repeat_ngram_size=4,
+             pad_token_id=tokenizer.eos_token_id,
+         )
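+     # repetition_penalty > 1 down-weights tokens that already occurred,
+     # and no_repeat_ngram_size=4 forbids any 4-gram from appearing twice;
+     # both are stock transformers generate() options.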

+     # 5) strip the prompt part, decode only the new tokens
+     generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
+     raw_text = tokenizer.decode(
+         generated_ids,
+         skip_special_tokens=True,
+     ).strip()
+
+     # 6) small cleanup: remove very long runs of the same line
+     # (a simple heuristic to kill the worst repetition cases)
+     lines = [l.strip() for l in raw_text.splitlines() if l.strip()]
+     cleaned = []
+     last_line = None
+     repeat_count = 0
+     for l in lines:
+         if l == last_line:
+             repeat_count += 1
+             if repeat_count >= 2:
+                 # skip extra repetitions
+                 continue
+         else:
+             repeat_count = 0
+         last_line = l
+         cleaned.append(l)
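+     # e.g. ["A", "A", "A", "A", "B"] becomes ["A", "A", "B"]: at most two
+     # consecutive copies of any line survive.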
+
+     answer = " ".join(cleaned).strip()
+     return answer or raw_text
+
+
+ # ------------------------------------------------------------------
# GRADIO UI
+ # ------------------------------------------------------------------
demo = gr.ChatInterface(
+     fn=generate_answer,
+     # Slovene UI strings: title "GenUI teacher assistant", description
+     # "A chatbot adapted to your material (CSV chunks)."
+     title="GenUI učiteljski pomočnik",
+     description="Klepetalnik, prilagojen na tvoje gradivo (CSV chunki).",
)

+
if __name__ == "__main__":
    demo.launch()
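
To sanity-check the retrieval step of the updated app.py without launching the UI, a minimal sketch (the file name and test question are made up; importing app also loads the model, so the first run is slow):

# smoke_test.py (hypothetical)
import app

question = "Kako pripravim učni načrt?"  # "How do I prepare a lesson plan?"
for i, chunk in enumerate(app.retrieve_chunks(question), start=1):
    print(f"--- chunk {i} ---")
    print(chunk[:200])  # preview the first 200 characters of each chunk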