Update app.py
Browse files
app.py
CHANGED
|
@@ -15,11 +15,13 @@ import threading
|
|
| 15 |
from sentence_transformers import SentenceTransformer
|
| 16 |
import numpy as np
|
| 17 |
import faiss
|
|
|
|
| 18 |
|
| 19 |
# --- Konfiguration ---
|
| 20 |
CHARGENODE_URL = "https://www.chargenode.eu"
|
| 21 |
-
MAX_CHUNK_SIZE =
|
| 22 |
-
|
|
|
|
| 23 |
|
| 24 |
# Kontrollera om vi kör i Hugging Face-miljön
|
| 25 |
IS_HUGGINGFACE = os.environ.get("SPACE_ID") is not None
|
|
@@ -69,6 +71,7 @@ embeddings = None
|
|
| 69 |
index = None
|
| 70 |
chunks = []
|
| 71 |
chunk_sources = []
|
|
|
|
| 72 |
|
| 73 |
# --- Förbättrad loggfunktion ---
|
| 74 |
def safe_append_to_log(log_entry):
|
|
@@ -163,29 +166,117 @@ def load_prompt():
|
|
| 163 |
print(f"Fel vid inläsning av prompt.txt: {e}, använder standardprompt")
|
| 164 |
return "Du är ChargeNode's AI-assistent. Svara på frågor om ChargeNode's produkter och tjänster baserat på den tillhandahållna informationen."
|
| 165 |
|
| 166 |
-
#
|
| 167 |
def prepare_chunks(text_data):
|
| 168 |
-
"""Delar upp texten i mindre segment för embedding och sökning."""
|
| 169 |
chunks, sources = [], []
|
|
|
|
|
|
|
| 170 |
for source, text in text_data.items():
|
|
|
|
| 171 |
paragraphs = [p for p in text.split("\n") if p.strip()]
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
else:
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
return chunks, sources
|
| 185 |
|
| 186 |
def initialize_embeddings():
|
| 187 |
"""Initierar SentenceTransformer och FAISS-index vid första anrop."""
|
| 188 |
-
global embedder, embeddings, index, chunks, chunk_sources
|
| 189 |
|
| 190 |
if embedder is None:
|
| 191 |
print("Initierar SentenceTransformer och FAISS-index...")
|
|
@@ -203,12 +294,62 @@ def initialize_embeddings():
|
|
| 203 |
index = faiss.IndexFlatIP(embeddings.shape[1])
|
| 204 |
index.add(embeddings)
|
| 205 |
print("FAISS-index klart")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
def retrieve_context(query, k=RETRIEVAL_K):
|
| 208 |
-
"""Hämtar relevant kontext för frågor."""
|
| 209 |
# Säkerställ att modeller är laddade
|
| 210 |
initialize_embeddings()
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
query_embedding = embedder.encode([query], convert_to_numpy=True)
|
| 213 |
query_embedding /= np.linalg.norm(query_embedding)
|
| 214 |
D, I = index.search(query_embedding, k)
|
|
|
|
| 15 |
from sentence_transformers import SentenceTransformer
|
| 16 |
import numpy as np
|
| 17 |
import faiss
|
| 18 |
+
import re
|
| 19 |
|
| 20 |
# --- Konfiguration ---
|
| 21 |
CHARGENODE_URL = "https://www.chargenode.eu"
|
| 22 |
+
MAX_CHUNK_SIZE = 2000 # Ökad chunkstorleken för att bättre hantera FAQ-svar
|
| 23 |
+
CHUNK_OVERLAP = 200 # Nytt: Overlapping chunks för att inte tappa kontext
|
| 24 |
+
RETRIEVAL_K = 5 # Antal chunker att hämta vid varje sökning
|
| 25 |
|
| 26 |
# Kontrollera om vi kör i Hugging Face-miljön
|
| 27 |
IS_HUGGINGFACE = os.environ.get("SPACE_ID") is not None
|
|
|
|
| 71 |
index = None
|
| 72 |
chunks = []
|
| 73 |
chunk_sources = []
|
| 74 |
+
faq_dict = {} # Ny: Dictionary för direktmatchning av vanliga frågor
|
| 75 |
|
| 76 |
# --- Förbättrad loggfunktion ---
|
| 77 |
def safe_append_to_log(log_entry):
|
|
|
|
| 166 |
print(f"Fel vid inläsning av prompt.txt: {e}, använder standardprompt")
|
| 167 |
return "Du är ChargeNode's AI-assistent. Svara på frågor om ChargeNode's produkter och tjänster baserat på den tillhandahållna informationen."
|
| 168 |
|
| 169 |
+
# --- Förbättrad chunking ---
|
| 170 |
# Markers used by the FAQ source format ("Fråga: ..." followed by "Svar: ...").
_FAQ_QUESTION_PREFIX = "Fråga:"
_FAQ_ANSWER_MARKER = "Svar:"

# A question containing any of these substrings is treated as payment-related.
# NOTE(review): "kort" is a substring test, so it also matches e.g. "laddkort";
# kept as in the original heuristic - confirm that this is intended.
_PAYMENT_QUESTION_TERMS = (
    "betalsätt", "betalmetod", "betalmedel", "kort",
    "betalkort", "betalning", "betala",
)

# Alternative phrasings registered for every payment-related FAQ answer.
_PAYMENT_VARIATIONS = (
    "hur ändrar jag betalmedel",
    "hur byter jag betalsätt",
    "hur uppdaterar jag mitt betalkort",
    "hur ändrar jag betalmetod",
    "hur byter jag betalningsmetod",
    "hur ändrar jag betalkort",
)


def _split_oversized(paragraph, limit, overlap):
    """Cut one over-long paragraph into windows of at most ``limit`` characters.

    Consecutive windows share ``overlap`` characters so that text cut at a
    window border still appears intact in one of the pieces.
    """
    step = max(1, limit - overlap)
    pieces = []
    pos = 0
    while True:
        pieces.append(paragraph[pos:pos + limit])
        if pos + limit >= len(paragraph):
            return pieces
        pos += step


def _register_faq(question, chunk_text):
    """Store the answer found in ``chunk_text`` in ``faq_dict`` under ``question``."""
    marker_pos = chunk_text.find(_FAQ_ANSWER_MARKER)
    if marker_pos < 0:
        return
    answer_text = chunk_text[marker_pos + len(_FAQ_ANSWER_MARKER):].strip()
    question_lower = question.lower()

    # Payment questions are also reachable through a set of common rephrasings.
    if any(term in question_lower for term in _PAYMENT_QUESTION_TERMS):
        for variation in _PAYMENT_VARIATIONS:
            faq_dict[variation] = answer_text

    # Normalise the key the same way the lookup normalises the user's query
    # (lower-case, surrounding "?" and whitespace removed); a key that keeps
    # its trailing "?" can never be hit by an exact lookup.
    key = question_lower.strip("? \t\r\n")
    if key:
        faq_dict[key] = answer_text


def _add_overlap_chunks(chunks, sources):
    """Return chunks/sources extended with "chunk + head of next chunk" entries.

    An overlap entry is only added when the two neighbours come from the same
    source, the current chunk does not already end with the head of the next
    one, and the current chunk has at least CHUNK_OVERLAP characters of room
    left below MAX_CHUNK_SIZE.
    """
    overlap_chunks, overlap_sources = [], []
    last = len(chunks) - 1
    for j, chunk in enumerate(chunks):
        overlap_chunks.append(chunk)
        overlap_sources.append(sources[j])

        if j == last:
            continue
        # Never glue text from another source onto this source's chunk.
        if sources[j + 1] != sources[j]:
            continue
        head_of_next = chunks[j + 1][:CHUNK_OVERLAP]
        if chunk.endswith(head_of_next):
            continue  # already overlapping (e.g. windows of a split paragraph)
        if MAX_CHUNK_SIZE - len(chunk) >= CHUNK_OVERLAP:
            overlap_chunks.append(chunk + " " + head_of_next)
            overlap_sources.append(sources[j])
    return overlap_chunks, overlap_sources


def prepare_chunks(text_data):
    """Split source texts into chunks for embedding and collect FAQ pairs.

    Each text is split on newlines into non-empty, stripped paragraphs.

    * A paragraph starting with "Fråga:" opens an FAQ chunk that absorbs the
      following paragraphs up to the next "Fråga:". Once the chunk contains
      "Svar:", it may grow past MAX_CHUNK_SIZE (until it exceeds 1.5x) so
      that answers are not cut in half. The question/answer pair is stored
      in the module-level ``faq_dict``.
    * Other paragraphs are packed, space-separated, into chunks of at most
      MAX_CHUNK_SIZE characters. Packing stops at the next "Fråga:" so FAQ
      entries are never swallowed by ordinary text, and a single paragraph
      longer than MAX_CHUNK_SIZE is split into windows instead of dropped.

    Finally, overlapping variants of neighbouring chunks are appended.

    Args:
        text_data: Mapping of source name to the full text of that source.

    Returns:
        A tuple ``(chunks, sources)`` of equal-length lists, where
        ``sources[n]`` is the source name that ``chunks[n]`` came from.
    """
    chunks, sources = [], []

    for source, text in text_data.items():
        # Strip each paragraph so an indented "Fråga:" line is still recognised.
        paragraphs = [p.strip() for p in text.split("\n") if p.strip()]

        i = 0
        while i < len(paragraphs):
            paragraph = paragraphs[i]

            # --- FAQ entry: question line plus everything up to the next one
            if paragraph.startswith(_FAQ_QUESTION_PREFIX):
                question = paragraph[len(_FAQ_QUESTION_PREFIX):].strip()
                current_chunk = paragraph
                i += 1

                while (i < len(paragraphs)
                       and not paragraphs[i].startswith(_FAQ_QUESTION_PREFIX)):
                    fits = (len(current_chunk) + len(paragraphs[i]) + 1
                            <= MAX_CHUNK_SIZE)
                    if not fits:
                        in_answer = _FAQ_ANSWER_MARKER in current_chunk
                        # Prefer whole answers: overflow is tolerated until
                        # the chunk is more than 50 % over the limit.
                        if (not in_answer
                                or len(current_chunk) > MAX_CHUNK_SIZE * 1.5):
                            break
                    current_chunk += "\n" + paragraphs[i]
                    i += 1

                if _FAQ_ANSWER_MARKER in current_chunk:
                    _register_faq(question, current_chunk)

                chunks.append(current_chunk)
                sources.append(source)
                continue

            # --- Ordinary text: a paragraph that alone exceeds the limit
            if len(paragraph) > MAX_CHUNK_SIZE:
                for piece in _split_oversized(paragraph, MAX_CHUNK_SIZE,
                                              CHUNK_OVERLAP):
                    piece = piece.strip()
                    if piece:
                        chunks.append(piece)
                        sources.append(source)
                i += 1
                continue

            # --- Ordinary text: pack paragraphs until the limit or next FAQ
            current_chunk = paragraph
            i += 1
            while (i < len(paragraphs)
                   and not paragraphs[i].startswith(_FAQ_QUESTION_PREFIX)
                   and len(current_chunk) + len(paragraphs[i]) + 1
                   <= MAX_CHUNK_SIZE):
                current_chunk += " " + paragraphs[i]
                i += 1

            chunks.append(current_chunk)
            sources.append(source)

    chunks, sources = _add_overlap_chunks(chunks, sources)

    print(f"Genererade {len(chunks)} chunks med {len(faq_dict)} FAQ-par")
    return chunks, sources
|
| 276 |
|
| 277 |
def initialize_embeddings():
|
| 278 |
"""Initierar SentenceTransformer och FAISS-index vid första anrop."""
|
| 279 |
+
global embedder, embeddings, index, chunks, chunk_sources, faq_dict
|
| 280 |
|
| 281 |
if embedder is None:
|
| 282 |
print("Initierar SentenceTransformer och FAISS-index...")
|
|
|
|
| 294 |
index = faiss.IndexFlatIP(embeddings.shape[1])
|
| 295 |
index.add(embeddings)
|
| 296 |
print("FAISS-index klart")
|
| 297 |
+
|
| 298 |
+
# Print FAQ dictionary keys for debugging
|
| 299 |
+
print(f"FAQ Dictionary innehåller {len(faq_dict)} nycklar")
|
| 300 |
+
if len(faq_dict) > 0:
|
| 301 |
+
payment_keys = [k for k in faq_dict.keys() if any(term in k for term in ["betalsätt", "betalmetod", "betalmedel"])]
|
| 302 |
+
print(f"Betalningsrelaterade FAQ-nycklar: {payment_keys[:5]}")
|
| 303 |
+
|
| 304 |
+
# Direkt matchningsfunktion för vanliga frågor
|
| 305 |
+
# Query openings and topic substrings that identify "change my payment card".
_PAYMENT_QUERY_PREFIXES = ("hur ändrar jag", "hur byter jag", "hur uppdaterar jag")
# NOTE(review): "kort" is a substring test, so "laddkort" also matches; kept
# as in the original heuristic - confirm that this is intended.
_PAYMENT_QUERY_TERMS = ("betalsätt", "betalmetod", "betalmedel", "betalkort", "kort")

# Fixed answer returned for payment-card questions (user-facing text).
_PAYMENT_ANSWER = "\n".join([
    "Så här gör du om du vill byta betalkort:",
    "1. Gå in i appen.",
    "2. Tryck på meny och mina betalsätt",
    "3. Tryck på ersätt kort.",
    "4. Godkänn våra villkor",
    "5. Tryck på kortbetalning under \"bekräfta för auktorisering\"",
    "6. Lägg in dina nya kort uppgifter",
    "7. Bekräfta med BankID.",
    "",
    "OBS! Se till att kortet har pengar och att det är upplåst för internetbetalningar.",
])

# Substrings marking a "change something" question. "byt" covers both
# "byta" and "byter"; "ändra" and "uppdatera" cover their present-tense forms.
_CHANGE_STEMS = ("ändra", "byt", "uppdatera")

# Function words that say nothing about the topic of a question.
_STOPWORDS = frozenset({
    "hur", "jag", "kan", "man", "vi", "du", "ni", "min", "mitt", "mina",
    "din", "ditt", "dina", "en", "ett", "den", "det", "de", "att", "och",
    "eller", "i", "på", "av", "för", "med", "till", "från", "om", "som",
    "är", "var", "vad", "när", "ska", "vill", "går", "gör", "inte", "har",
})

_TOKEN_PUNCTUATION = "?!.,;:\"'()"


def _normalize_question(text):
    """Lower-case ``text`` and drop surrounding whitespace and question marks."""
    return text.lower().strip("? \t\r\n")


def _topic_terms(text):
    """Return the words of ``text`` that describe its topic.

    Punctuation is stripped from each word; stopwords and the change verbs
    themselves (anything containing one of _CHANGE_STEMS) are left out.
    """
    terms = set()
    for raw in text.split():
        token = raw.strip(_TOKEN_PUNCTUATION)
        if not token or token in _STOPWORDS:
            continue
        if any(stem in token for stem in _CHANGE_STEMS):
            continue
        terms.add(token)
    return terms


def check_direct_match(query):
    """Return a predefined FAQ answer for ``query``, or None if none applies.

    Checks, in order:

    1. Payment-card questions ("hur ändrar/byter/uppdaterar jag ..." that
       mention a payment term) get a fixed step-by-step answer.
    2. An exact match of the normalised query against ``faq_dict``. Keys are
       normalised the same way, so a key stored with a trailing "?" matches.
    3. A fuzzy match for "change something" questions: both the query and
       the FAQ key must contain a change verb, and they must share at least
       one topic word. Stopwords such as "hur" and "jag" do not count as
       shared terms. The key sharing the most topic words wins.

    Args:
        query: The user's question. None or an empty string yields None.

    Returns:
        The answer text, or None when no direct match is found.
    """
    if not query:
        return None
    query_lower = _normalize_question(query)
    if not query_lower:
        return None

    # 1. Explicit check for the payment-method question.
    if (query_lower.startswith(_PAYMENT_QUERY_PREFIXES)
            and any(term in query_lower for term in _PAYMENT_QUERY_TERMS)):
        return _PAYMENT_ANSWER

    # 2. Exact match on the key as stored.
    exact = faq_dict.get(query_lower)
    if exact is not None:
        return exact

    # 3. One pass over the FAQ: a normalised-exact hit returns immediately,
    #    otherwise remember the best fuzzy candidate.
    query_is_change = any(stem in query_lower for stem in _CHANGE_STEMS)
    query_topics = _topic_terms(query_lower) if query_is_change else set()

    best_answer = None
    best_overlap = 0
    for key, value in faq_dict.items():
        key_lower = _normalize_question(key)
        if key_lower == query_lower:
            return value
        if not query_topics:
            continue
        if not any(stem in key_lower for stem in _CHANGE_STEMS):
            continue
        overlap = len(query_topics & _topic_terms(key_lower))
        if overlap > best_overlap:
            best_answer, best_overlap = value, overlap

    return best_answer
|
| 340 |
|
| 341 |
def retrieve_context(query, k=RETRIEVAL_K):
|
| 342 |
+
"""Hämtar relevant kontext för frågor med direkt matchning för vanliga frågor."""
|
| 343 |
# Säkerställ att modeller är laddade
|
| 344 |
initialize_embeddings()
|
| 345 |
|
| 346 |
+
# Först, kolla efter direktmatchningar för vanliga frågor
|
| 347 |
+
direct_match = check_direct_match(query)
|
| 348 |
+
if direct_match:
|
| 349 |
+
print(f"Direkt matchning hittad för frågan: {query}")
|
| 350 |
+
return f"Fråga: {query}\nSvar: {direct_match}", ["direct_match"]
|
| 351 |
+
|
| 352 |
+
# Om ingen direktmatchning, använd vanlig embedding-sökning
|
| 353 |
query_embedding = embedder.encode([query], convert_to_numpy=True)
|
| 354 |
query_embedding /= np.linalg.norm(query_embedding)
|
| 355 |
D, I = index.search(query_embedding, k)
|