File size: 6,116 Bytes
31bbe51
1756bf4
729f39a
1756bf4
729f39a
1756bf4
729f39a
 
1756bf4
 
 
 
31bbe51
 
 
 
 
 
 
729f39a
31bbe51
 
 
 
 
 
729f39a
9765f7f
729f39a
 
728639e
 
 
 
 
 
 
9765f7f
728639e
 
31bbe51
 
 
 
729f39a
 
31bbe51
 
 
 
 
 
 
 
1756bf4
31bbe51
 
 
 
 
 
 
 
1756bf4
31bbe51
bc54961
58dc6f1
 
 
 
 
bc54961
58dc6f1
 
 
 
 
 
 
 
 
20ecec9
58dc6f1
31bbe51
9765f7f
58dc6f1
 
 
 
 
 
9765f7f
58dc6f1
 
 
31bbe51
 
 
 
729f39a
31bbe51
 
bc54961
20ecec9
bc54961
20ecec9
bc54961
729f39a
bc54961
47f4327
bc54961
 
 
 
 
 
 
31bbe51
bc54961
31bbe51
 
 
 
 
1756bf4
bc54961
1756bf4
20ecec9
bc54961
20ecec9
bc54961
 
 
 
 
 
20ecec9
 
 
bc54961
 
1756bf4
bc54961
171c15b
bc54961
 
47f4327
bc54961
 
 
 
 
 
a9e3d91
58dc6f1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import json
import glob
import os
import random
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv

# Core AI Libraries
from sentence_transformers import SentenceTransformer, util
from groq import Groq
from datasets import load_dataset, Dataset

# Load environment variables from a local .env file (GROQ_API_KEY, HF_TOKEN).
load_dotenv()

# Initialize Groq client used for all LLM completions below.
# If GROQ_API_KEY is unset, os.getenv returns None and API calls will fail
# at request time (handled by query_groq_llm's except block).
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Load similarity model used to embed both the dataset questions (once, at
# import time) and each incoming user query.
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Config: Hugging Face dataset repo where unmatched user queries are logged.
HF_DATASET_REPO = "midrees2806/unmatched_queries"
HF_TOKEN = os.getenv("HF_TOKEN")

# Greeting list — matched exactly (whole message) in get_best_answer's
# greeting branch, and by substring in its length-check branch.
GREETINGS = ["hi", "hello", "hey", "good morning", "good afternoon", "good evening", "assalam o alaikum", "salam", "aoa", "hi there", "hey there", "greetings"]

# Load multiple JSON datasets from datasets/*.json. Each file is expected to
# contain a JSON list of {"Question": ..., "Answer": ...} dicts; anything
# else (non-list files, malformed items) is silently skipped. Any I/O or
# parse error leaves `dataset` partially filled and is logged, not raised.
dataset = []
try:
    json_files = glob.glob('datasets/*.json')
    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            if isinstance(data, list):
                dataset.extend([item for item in data if isinstance(item, dict) and 'Question' in item and 'Answer' in item])
except Exception as e:
    print(f"Error loading datasets: {e}")

# Precompute embeddings for all dataset questions once at import time so each
# user query only needs a single encode + cosine-similarity pass.
# NOTE(review): if no datasets loaded, these lists are empty and
# `dataset_embeddings` is an empty tensor — downstream code must guard this.
dataset_questions = [item.get("Question", "").lower().strip() for item in dataset]
dataset_answers = [item.get("Answer", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)

def manage_unmatched_queries(query: str) -> None:
    """Record an unanswered user query in the Hugging Face dataset repo.

    Loads the existing ``HF_DATASET_REPO`` dataset (or starts a fresh empty
    frame if it cannot be fetched), appends *query* with a timestamp if it
    is not already present, and pushes the result back to the Hub.

    Best-effort by design: any failure is printed, never raised, so logging
    can never break the user-facing chat flow.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception:
            # Repo may not exist yet or be unreachable — start with an
            # empty frame. (Was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; narrowed to Exception.)
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

        # Skip exact-duplicate query strings so repeats aren't re-logged.
        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
            updated_ds = Dataset.from_pandas(df)
            updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")

def query_groq_llm(prompt):
    """Send *prompt* to Groq's Llama-3.3-70b model and return the reply text.

    The completion is requested in streaming mode; the streamed deltas are
    collected and joined into a single string. Returns the stripped reply,
    or ``None`` if the API call fails for any reason (error is printed).
    """
    system_message = {
        "role": "system",
        "content": "You are the official UOE AI Assistant. Your goal is to provide highly structured, professional, and easy-to-read information about the University of Education Lahore."
    }
    user_message = {"role": "user", "content": prompt}

    try:
        # Llama-3.3-70b-versatile with streaming enabled
        stream = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[system_message, user_message],
            temperature=1,
            max_completion_tokens=1024,
            top_p=1,
            stream=True,
            stop=None,
        )

        # Some streamed chunks carry no content (delta.content is None);
        # coalesce those to "" before joining.
        pieces = []
        for chunk in stream:
            pieces.append(chunk.choices[0].delta.content or "")
        return "".join(pieces).strip()

    except Exception as e:
        print(f"Groq API Error: {e}")
        return None

def get_best_answer(user_input):
    """Answer *user_input* about the University of Education Lahore.

    Flow:
      1. Reject blank input; answer plain greetings directly.
      2. Ask for more detail on very short (< 3 word) non-greeting queries.
      3. Find the most similar question in the verified dataset. If cosine
         similarity >= 0.65, ask the LLM to reformat the verified answer
         (PATH 1); otherwise log the query as unmatched and ask the LLM
         for a general-knowledge answer (PATH 2).

    Returns a response string, or ``None`` if the LLM call fails.
    """
    if not user_input.strip():
        return "Please enter a valid question."

    user_input_lower = user_input.lower().strip()

    # Greeting Handling — exact whole-message match only.
    if any(greet == user_input_lower for greet in GREETINGS):
        return "Hello! I am the UOE AI Assistant. How can I help you regarding admissions, fees, or programs today?"

    # Length check. NOTE(review): the greeting test here is a substring
    # match ("hi" also matches inside words like "chicago"); kept as-is
    # for compatibility with existing behavior.
    if len(user_input_lower.split()) < 3 and not any(greet in user_input_lower for greet in GREETINGS):
        return "Please provide more details. I need at least 3 words to understand your query properly."

    # Similarity Calculation against the precomputed dataset embeddings.
    # Guard: if no dataset files were loaded, there is nothing to match —
    # fall straight through to the general-knowledge path. (Previously
    # this crashed: argmax over an empty similarity tensor.)
    best_score = 0.0
    best_match_idx = -1
    if len(dataset_answers) > 0:
        user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
        best_match_idx = similarities.argmax().item()
        best_score = similarities[best_match_idx].item()

    if best_score >= 0.65:
        # --- PATH 1: DATASET MATCH (Enhanced Formatting) ---
        original_answer = dataset_answers[best_match_idx]
        prompt = f"""You are the official University of Education (UOE) Assistant. 
        I have found a verified answer in our database. Please rephrase it to make it extremely professional and user-friendly.

        STRICT GUIDELINES:
        1. Use clear **Headings**.
        2. Use **Bullet Points** for lists.
        3. If there is data like timing, criteria, or contact info, present it in a **Markdown Table**.
        4. Bold important keywords.
        5. Maintain a polite and welcoming tone.

        User Question: {user_input}
        Verified Data: {original_answer}

        Enhanced Response:"""
    else:
        # --- PATH 2: GENERAL KNOWLEDGE (with Forwarding Note) ---
        # Log the miss so the verified dataset can be extended later.
        manage_unmatched_queries(user_input)
        prompt = f"""You are the UOE AI Assistant for University of Education Lahore. 
        The user asked: "{user_input}". This query is not in our verified database.

        Instructions:
        1. Answer based on your general knowledge about UOE Lahore.
        2. Format the response with headings and bold text.
        3. At the end, you MUST add this exact text:
           "📢 *Note: Your query has been forwarded to our support team. We are currently updating our verified database to provide a verified answer next time.*"
        4. List official contacts: Website: https://ue.edu.pk | Phone: +92-42-99262231-33"""

    return query_groq_llm(prompt)