import glob
import json
import os
import random
from datetime import datetime

import pandas as pd
from dotenv import load_dotenv

# Core AI Libraries
from sentence_transformers import SentenceTransformer, util
from groq import Groq
from datasets import load_dataset, Dataset

# Load environment variables
load_dotenv()

# Initialize Groq client
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Load similarity model (small paraphrase model; used only for cosine matching)
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Config
HF_DATASET_REPO = "midrees2806/unmatched_queries"
HF_TOKEN = os.getenv("HF_TOKEN")

# Greeting list — matched by exact equality against the lowercased input
GREETINGS = [
    "hi", "hello", "hey", "good morning", "good afternoon", "good evening",
    "assalam o alaikum", "salam", "aoa", "hi there", "hey there", "greetings",
]

# Load multiple JSON datasets.
# Each file under datasets/ is expected to hold a list of
# {"Question": ..., "Answer": ...} dicts; anything else is skipped.
# Errors are isolated per file so one malformed file does not abort the rest.
dataset = []
for file_path in glob.glob('datasets/*.json'):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):
            dataset.extend(
                item for item in data
                if isinstance(item, dict) and 'Question' in item and 'Answer' in item
            )
    except Exception as e:
        print(f"Error loading datasets: {e}")

# Precompute question embeddings once at startup so each user query only
# needs a single encode + cosine-similarity pass.
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)


def manage_unmatched_queries(query: str) -> None:
    """Record an unanswered user query in the Hugging Face dataset repo.

    Loads the existing ``train`` split (or starts an empty frame if the repo
    is missing/unreadable), appends the query with a timestamp if it is not
    already present, and pushes the updated dataset back to the Hub.

    Best-effort: any failure is logged and swallowed so the chat flow is
    never interrupted by logging problems.

    Args:
        query: The raw user query that had no verified dataset match.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception:
            # Repo may not exist yet (first run) — start a fresh log.
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

        # Only push when the query is genuinely new, to avoid duplicate rows
        # and needless Hub uploads.
        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
            updated_ds = Dataset.from_pandas(df)
            updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")


def query_groq_llm(prompt):
    """Send *prompt* to the Groq chat API and return the full response text.

    Streams the completion and concatenates the chunks; returns the stripped
    response string, or ``None`` if the API call fails.

    Args:
        prompt: The user-role prompt to send.

    Returns:
        The assistant's reply as a string, or ``None`` on error.
    """
    try:
        # Llama-3.3-70b-versatile with streaming enabled
        completion = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {
                    "role": "system",
                    "content": "You are the official UOE AI Assistant. Your goal is to provide highly structured, professional, and easy-to-read information about the University of Education Lahore."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=1,
            max_completion_tokens=1024,
            top_p=1,
            stream=True,
            stop=None
        )
        full_response = ""
        for chunk in completion:
            # delta.content is None on some chunks (e.g. the final one).
            content = chunk.choices[0].delta.content or ""
            full_response += content
        return full_response.strip()
    except Exception as e:
        print(f"Groq API Error: {e}")
        return None


def get_best_answer(user_input):
    """Answer a user query about the University of Education Lahore.

    Pipeline:
      1. Reject empty input; short-circuit on exact greetings.
      2. Require at least 3 words for a substantive query.
      3. Cosine-match the query against the precomputed dataset embeddings.
      4. Score >= 0.65 → ask the LLM to reformat the verified answer;
         otherwise log the query as unmatched and ask the LLM to answer
         from general knowledge with a forwarding note.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        A response string, or ``None`` if the LLM call fails.
    """
    if not user_input.strip():
        return "Please enter a valid question."

    user_input_lower = user_input.lower().strip()

    # Greeting Handling (exact match only)
    if any(greet == user_input_lower for greet in GREETINGS):
        return "Hello! I am the UOE AI Assistant. How can I help you regarding admissions, fees, or programs today?"

    # Length check
    if len(user_input_lower.split()) < 3 and not any(greet in user_input_lower for greet in GREETINGS):
        return "Please provide more details. I need at least 3 words to understand your query properly."

    # # Direct Fee Link (intentionally disabled — kept for reference)
    # if any(kw in user_input_lower for kw in ["fee structure", "fees structure", "semester fees", "semester fee"]):
    #     return (
    #         "💰 **Official Fee Structure**\n\n"
    #         "For the most accurate and up-to-date fee details, please visit the official University of Education link:\n"
    #         "🔗 https://ue.edu.pk/allfeestructure.php"
    #     )

    # Similarity Calculation.
    # Guard against an empty dataset: argmax on an empty tensor raises, so
    # with no verified Q/A pairs we fall straight through to the
    # general-knowledge path.
    best_score = 0.0
    best_match_idx = -1
    if dataset_questions:
        user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
        best_match_idx = similarities.argmax().item()
        best_score = similarities[best_match_idx].item()

    if best_score >= 0.65:
        # --- PATH 1: DATASET MATCH (Enhanced Formatting) ---
        original_answer = dataset_answers[best_match_idx]

        prompt = f"""You are the official University of Education (UOE) Assistant. I have found a verified answer in our database. Please rephrase it to make it extremely professional and user-friendly.

STRICT GUIDELINES:
1. Use clear **Headings**.
2. Use **Bullet Points** for lists.
3. If there is data like timing, criteria, or contact info, present it in a **Markdown Table**.
4. Bold important keywords.
5. Maintain a polite and welcoming tone.

User Question: {user_input}
Verified Data: {original_answer}

Enhanced Response:"""
    else:
        # --- PATH 2: GENERAL KNOWLEDGE (with Forwarding Note) ---
        manage_unmatched_queries(user_input)

        prompt = f"""You are the UOE AI Assistant for University of Education Lahore.
The user asked: "{user_input}".
This query is not in our verified database.

Instructions:
1. Answer based on your general knowledge about UOE Lahore.
2. Format the response with headings and bold text.
3. At the end, you MUST add this exact text:
"📢 *Note: Your query has been forwarded to our support team. We are currently updating our verified database to provide a verified answer next time.*"
4. List official contacts: Website: https://ue.edu.pk | Phone: +92-42-99262231-33"""

    return query_groq_llm(prompt)