"""UOE AI Assistant.

Answers questions about the University of Education Lahore by semantically
matching user queries against local JSON Q&A datasets, then polishing (or
falling back to) responses via the Groq LLM API. Unmatched queries are
logged to a Hugging Face dataset repo for later curation.
"""
| import json | |
| import glob | |
| import os | |
| import random | |
| import pandas as pd | |
| from datetime import datetime | |
| from dotenv import load_dotenv | |
| # Core AI Libraries | |
| from sentence_transformers import SentenceTransformer, util | |
| from groq import Groq | |
| from datasets import load_dataset, Dataset | |
# --- Environment & client setup ---------------------------------------------

# Pull GROQ_API_KEY / HF_TOKEN from a local .env file into the environment.
load_dotenv()

# Groq chat-completions client used for every LLM call.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Sentence-embedding model for semantic matching of user questions.
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Hugging Face dataset repo where unanswered queries are logged.
HF_DATASET_REPO = "midrees2806/unmatched_queries"
HF_TOKEN = os.getenv("HF_TOKEN")

# Greeting phrases, compared exactly against the lowercased user input.
GREETINGS = [
    "hi", "hello", "hey",
    "good morning", "good afternoon", "good evening",
    "assalam o alaikum", "salam", "aoa",
    "hi there", "hey there", "greetings",
]
# --- Q&A knowledge base ------------------------------------------------------
# Merge every JSON file under datasets/ into one list of records that carry
# both a 'Question' and an 'Answer' key; anything else is silently skipped so
# a single malformed file cannot abort startup.
dataset = []
try:
    for file_path in glob.glob('datasets/*.json'):
        with open(file_path, 'r', encoding='utf-8') as f:
            records = json.load(f)
        if not isinstance(records, list):
            continue
        for entry in records:
            if isinstance(entry, dict) and 'Question' in entry and 'Answer' in entry:
                dataset.append(entry)
except Exception as e:
    print(f"Error loading datasets: {e}")

# Precompute embeddings for all stored questions once at startup, so each
# incoming query needs only a single encode + cosine-similarity pass.
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
def manage_unmatched_queries(query: str) -> None:
    """Log an unanswered user query to the Hugging Face dataset repo.

    Appends ``query`` (with a timestamp and ``Processed=False`` flag) to
    ``HF_DATASET_REPO``, skipping exact duplicates. Best-effort by design:
    any failure is printed and swallowed so logging can never break the
    chat flow.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception:
            # FIX: was a bare `except:` which also traps SystemExit /
            # KeyboardInterrupt. Repo missing or fetch failed -> fresh log.
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])
        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
            updated_ds = Dataset.from_pandas(df)
            updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")
def query_groq_llm(prompt):
    """Send *prompt* to Groq's llama-3.3-70b-versatile model.

    Streams the completion, concatenates the delta chunks, and returns the
    stripped response text. Returns ``None`` if the API call fails.
    """
    messages = [
        {
            "role": "system",
            "content": "You are the official UOE AI Assistant. Your goal is to provide highly structured, professional, and easy-to-read information about the University of Education Lahore.",
        },
        {"role": "user", "content": prompt},
    ]
    try:
        stream = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            temperature=1,
            max_completion_tokens=1024,
            top_p=1,
            stream=True,
            stop=None,
        )
        # Fold the streamed deltas into one string; a chunk's content may
        # be None, which we treat as the empty string.
        pieces = [chunk.choices[0].delta.content or "" for chunk in stream]
        return "".join(pieces).strip()
    except Exception as e:
        print(f"Groq API Error: {e}")
        return None
def get_best_answer(user_input):
    """Answer a user question about the University of Education Lahore.

    Flow: validate input -> handle greetings -> semantically match against
    the verified Q&A dataset (cosine similarity >= 0.65) -> either have the
    LLM polish the verified answer, or fall back to general knowledge and
    log the query for the support team.

    Returns a response string, or ``None`` if the Groq call fails.
    """
    if not user_input.strip():
        return "Please enter a valid question."

    user_input_lower = user_input.lower().strip()

    # Greeting handling: exact match against the known greeting phrases.
    if user_input_lower in GREETINGS:
        return "Hello! I am the UOE AI Assistant. How can I help you regarding admissions, fees, or programs today?"

    # Very short non-greeting inputs rarely carry enough signal to match.
    if len(user_input_lower.split()) < 3 and not any(greet in user_input_lower for greet in GREETINGS):
        return "Please provide more details. I need at least 3 words to understand your query properly."

    # Semantic match against the precomputed dataset embeddings.
    # FIX: guard against an empty knowledge base — argmax on an empty
    # similarity tensor raises, so fall through to the fallback path.
    best_score = 0.0
    best_match_idx = -1
    if dataset_answers:
        user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
        best_match_idx = similarities.argmax().item()
        best_score = similarities[best_match_idx].item()

    if best_score >= 0.65:
        # --- PATH 1: verified dataset match -> polish the stored answer ---
        original_answer = dataset_answers[best_match_idx]
        prompt = f"""You are the official University of Education (UOE) Assistant.
I have found a verified answer in our database. Please rephrase it to make it extremely professional and user-friendly.
STRICT GUIDELINES:
1. Use clear **Headings**.
2. Use **Bullet Points** for lists.
3. If there is data like timing, criteria, or contact info, present it in a **Markdown Table**.
4. Bold important keywords.
5. Maintain a polite and welcoming tone.
User Question: {user_input}
Verified Data: {original_answer}
Enhanced Response:"""
    else:
        # --- PATH 2: no verified match -> general knowledge + log query ---
        manage_unmatched_queries(user_input)
        prompt = f"""You are the UOE AI Assistant for University of Education Lahore.
The user asked: "{user_input}". This query is not in our verified database.
Instructions:
1. Answer based on your general knowledge about UOE Lahore.
2. Format the response with headings and bold text.
3. At the end, you MUST add this exact text:
"📢 *Note: Your query has been forwarded to our support team. We are currently updating our verified database to provide a verified answer next time.*"
4. List official contacts: Website: https://ue.edu.pk | Phone: +92-42-99262231-33"""
    return query_groq_llm(prompt)