import ollama import re from config import MODEL_NAME from langdetect import detect import pandas as pd #client = ollama.Client(timeout=60) #def augment_question(question: str, lang: str, n: int = 3): # # system_prompt = ( # "You generate alternative user questions.\n" # "Rules:\n" # "- Keep the SAME meaning\n" # "- Do NOT answer\n" # "- Output ONLY a list\n" # "- Each line is ONE question\n" # f"- Generate exactly {n} variations\n" # ) # if lang == "ar": # system_prompt += "- Use Arabic conversational style\n" # prompt = f"{system_prompt}\nOriginal question:\n{question}" # response = client.generate( # model=MODEL_NAME, # prompt=prompt, # options={"temperature": 0.3} # ) # text = response["response"] # # Clean output # questions = [] # for line in text.split("\n"): # line = re.sub(r"^[\-\*\d\.\)]\s*", "", line).strip() # if line and line.lower() != question.lower(): # questions.append(line) # return questions[:n] def augment_question_smart(question: str, lang: str, n: int = 5): system_prompt = ( "Generate alternative ways to ask the same question.\n" "Rules:\n" "- Keep EXACT same meaning\n" "- Use different wordings and structures\n" "- Include formal and informal versions\n" "- Include questions with typos/mistakes users might make\n" f"- Generate exactly {n} variations\n" "- Output ONLY the questions, one per line\n" ) if lang == "ar": system_prompt += """ - استخدم اللهجة المصرية والفصحى - أضف أخطاء إملائية شائعة - استخدم صيغ مختلفة (ماذا، كيف، هل، ممكن، عايز) - حافظ على نفس المعى - استخدم كلمات مختلفة تعطى نفس المعنى - اخلق طرق بديلة لصياغة نفس السؤال - اخلق طرق بديلة لصياغة نفس السؤال باللغة العربية الفصحى واللهجة الدارجة """ prompt = f"{system_prompt}\n\nالسؤال الأصلي:\n{question}" response = ollama.generate( model=MODEL_NAME, prompt=prompt, options={"temperature": 0.5} # higher temp for diversity ) text = response["response"] questions = [] for line in text.split("\n"): line = re.sub(r"^[\-\*\d\.\)]\s*", "", line).strip() if line and line.lower() != question.lower(): questions.append(line) return questions[:n] # update loader def load_csv_with_augmentation(path): # load csv and augment df = pd.read_csv(path) df.columns = [c.strip().lower() for c in df.columns] chunks = [] for _, row in df.iterrows(): q = str(row["question"]).strip() a = str(row["answer"]).strip() if not q or not a: continue # 0riginal chunks.append(f"Question: {q}\nAnswer: {a}") # detect language lang = detect(q) # add augmented one augmented = augment_question_smart(q, lang, n=5) ## 6 for aug_q in augmented: chunks.append(f"Question: {aug_q}\nAnswer: {a}") return chunks