# UoeChatbot / rag.py
import json
import glob
import os
import random
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
# Core AI Libraries
from sentence_transformers import SentenceTransformer, util
from groq import Groq
from datasets import load_dataset, Dataset
# Load environment variables from a local .env file (provides GROQ_API_KEY, HF_TOKEN)
load_dotenv()
# Groq client used for all LLM chat completions
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
# Sentence-embedding model used to semantically match user queries to dataset questions
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# Hugging Face dataset repo where unmatched queries are logged for later review
HF_DATASET_REPO = "midrees2806/unmatched_queries"
HF_TOKEN = os.getenv("HF_TOKEN")  # may be None if not configured
# Exact-match greetings that short-circuit the retrieval pipeline
GREETINGS = ["hi", "hello", "hey", "good morning", "good afternoon", "good evening", "assalam o alaikum", "salam", "aoa", "hi there", "hey there", "greetings"]
# --- Load Q/A pairs from every JSON file under datasets/ ---
# Each file is expected to hold a list of {"Question": ..., "Answer": ...} dicts.
# Errors are handled per file so one malformed file cannot abort loading the rest
# (the original single try/except stopped at the first bad file).
dataset = []
for file_path in glob.glob('datasets/*.json'):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        print(f"Error loading datasets: {e}")
        continue
    if isinstance(data, list):
        # Keep only well-formed entries that carry both a question and an answer.
        dataset.extend(
            item for item in data
            if isinstance(item, dict) and 'Question' in item and 'Answer' in item
        )

# Precompute question embeddings once at startup so each incoming query
# needs only a single encode + cosine-similarity pass.
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
def manage_unmatched_queries(query: str) -> None:
    """Log *query* to the Hugging Face unmatched-queries dataset.

    Best-effort: all failures are printed and swallowed so that a logging
    hiccup can never break the chat flow. Duplicate queries are skipped.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception:
            # Repo missing/empty or network failure: start a fresh table.
            # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])
        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
            Dataset.from_pandas(df).push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")
def query_groq_llm(prompt):
    """Stream a completion for *prompt* from the Groq LLM.

    Returns the full, stripped response text, or None when the API call fails.
    """
    try:
        # llama-3.3-70b-versatile, consumed as a stream of delta chunks.
        stream = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {
                    "role": "system",
                    "content": "You are the official UOE AI Assistant. Your goal is to provide highly structured, professional, and easy-to-read information about the University of Education Lahore."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=1,
            max_completion_tokens=1024,
            top_p=1,
            stream=True,
            stop=None
        )
        # Collect streamed fragments and join once at the end.
        pieces = [chunk.choices[0].delta.content or "" for chunk in stream]
        return "".join(pieces).strip()
    except Exception as e:
        print(f"Groq API Error: {e}")
        return None
def get_best_answer(user_input):
    """Answer a user question about University of Education Lahore.

    Pipeline: empty-input guard -> exact-greeting shortcut -> minimum-length
    check -> semantic match against the verified dataset (cosine >= 0.65).
    A dataset hit is reformatted by the LLM; a miss is logged via
    manage_unmatched_queries() and answered from general knowledge with a
    forwarding note. Returns the LLM response string (or None on API failure).
    """
    if not user_input.strip():
        return "Please enter a valid question."
    user_input_lower = user_input.lower().strip()
    # Greeting Handling (exact match only)
    if any(greet == user_input_lower for greet in GREETINGS):
        return "Hello! I am the UOE AI Assistant. How can I help you regarding admissions, fees, or programs today?"
    # Length check: very short non-greeting queries are too ambiguous to match.
    # (Queries merely containing a greeting word fall through to matching.)
    if len(user_input_lower.split()) < 3 and not any(greet in user_input_lower for greet in GREETINGS):
        return "Please provide more details. I need at least 3 words to understand your query properly."
    # Similarity Calculation.
    # Guard against an empty dataset: argmax on an empty similarity tensor
    # would raise, so treat "no data" as "no match" and use general knowledge.
    best_score = 0.0
    best_match_idx = -1
    if dataset_questions:
        user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
        best_match_idx = similarities.argmax().item()
        best_score = similarities[best_match_idx].item()
    if best_score >= 0.65:
        # --- PATH 1: DATASET MATCH (Enhanced Formatting) ---
        original_answer = dataset_answers[best_match_idx]
        prompt = f"""You are the official University of Education (UOE) Assistant.
I have found a verified answer in our database. Please rephrase it to make it extremely professional and user-friendly.
STRICT GUIDELINES:
1. Use clear **Headings**.
2. Use **Bullet Points** for lists.
3. If there is data like timing, criteria, or contact info, present it in a **Markdown Table**.
4. Bold important keywords.
5. Maintain a polite and welcoming tone.
User Question: {user_input}
Verified Data: {original_answer}
Enhanced Response:"""
    else:
        # --- PATH 2: GENERAL KNOWLEDGE (with Forwarding Note) ---
        manage_unmatched_queries(user_input)
        prompt = f"""You are the UOE AI Assistant for University of Education Lahore.
The user asked: "{user_input}". This query is not in our verified database.
Instructions:
1. Answer based on your general knowledge about UOE Lahore.
2. Format the response with headings and bold text.
3. At the end, you MUST add this exact text:
"📢 *Note: Your query has been forwarded to our support team. We are currently updating our verified database to provide a verified answer next time.*"
4. List official contacts: Website: https://ue.edu.pk | Phone: +92-42-99262231-33"""
    return query_groq_llm(prompt)