midrees2806 commited on
Commit
63eae27
·
verified ·
1 Parent(s): 238cf2b

Delete rag.py

Browse files
Files changed (1) hide show
  1. rag.py +0 -144
rag.py DELETED
@@ -1,144 +0,0 @@
1
- import json
2
- import glob
3
- import os
4
- import pandas as pd
5
- import numpy as np
6
- from datetime import datetime
7
- from dotenv import load_dotenv
8
-
9
- # Core AI Libraries
10
- from sentence_transformers import SentenceTransformer, util
11
- from groq import Groq
12
- from datasets import load_dataset, Dataset
13
-
14
- # Image/UI Utils (Optional based on your imports)
15
- from PIL import Image, ImageDraw, ImageFont
16
- from io import BytesIO
17
-
# 1. INITIALIZATION & CONFIG
# Pull GROQ_API_KEY / HF_TOKEN from a local .env file before any getenv below.
load_dotenv()

# Hugging Face dataset repo used to log queries the bot could not answer.
HF_DATASET_REPO = "midrees2806/unmatched_queries"
HF_TOKEN = os.getenv("HF_TOKEN")

# LLM client for answer generation and the sentence-embedding model for retrieval.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# 2. DATA LOADING
# Q/A knowledge base, merged from every JSON file under datasets/.
dataset = []
try:
    for path in glob.glob('datasets/*.json'):
        with open(path, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)
        if not isinstance(payload, list):
            print(f"Skipping {path}: Expected a list of dictionaries.")
            continue
        # Keep only well-formed entries that carry both a question and an answer.
        dataset.extend(
            entry for entry in payload
            if isinstance(entry, dict) and 'Question' in entry and 'Answer' in entry
        )
except Exception as exc:
    print(f"Error loading datasets: {exc}")

# Index-aligned lists: normalised question text and its answer.
dataset_questions = [entry.get("Question", "").lower().strip() for entry in dataset]
dataset_answers = [entry.get("Answer", "") for entry in dataset]

# Encode the whole question corpus once at startup so each user query needs
# only a single embedding plus one cosine-similarity pass.
if not dataset_questions:
    dataset_embeddings = None
    print("Warning: Dataset is empty!")
else:
    dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
# 3. UTILITY FUNCTIONS
def manage_unmatched_queries(query: str):
    """Log a below-threshold query to the Hugging Face Hub dataset, best-effort.

    Duplicate queries are skipped; any failure is printed and swallowed so the
    caller's response flow is never interrupted.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            existing = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = existing["train"].to_pandas()
        except Exception:
            # First run (or repo unreachable): start an empty log with the schema.
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

        if query in df["Query"].values:
            return  # already logged; avoid duplicate rows and a needless push

        row = pd.DataFrame([{"Query": query, "Timestamp": timestamp, "Processed": False}])
        df = pd.concat([df, row], ignore_index=True)
        Dataset.from_pandas(df).push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as exc:
        print(f"Failed to save unmatched query: {exc}")
def query_groq_llm(prompt, system_message="You are an official assistant for the University of Education Lahore."):
    """Send *prompt* to Groq's Llama 3 model and return the reply text.

    Returns None (after printing the error) if the API call or response
    handling fails for any reason.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = groq_client.chat.completions.create(
            messages=messages,
            model="llama3-70b-8192",
            temperature=0.6,  # lower temperature keeps responses factual
            max_tokens=800,
        )
        # .strip() stays inside the try so a malformed response also returns None.
        return completion.choices[0].message.content.strip()
    except Exception as exc:
        print(f"Error querying Groq: {exc}")
        return None
90
- # 4. MAIN RAG LOGIC
91
- def get_best_answer(user_input):
92
- # Basic Validation
93
- if not user_input.strip():
94
- return "Please enter a valid question."
95
-
96
- user_input_lower = user_input.lower().strip()
97
- if len(user_input_lower.split()) < 3:
98
- return "Please ask your question with more detail (at least 3 words)."
99
-
100
- # Special Case: Fee Structure (Direct Link)
101
- if any(kw in user_input_lower for kw in ["fee structure", "fees structure", "fee list"]):
102
- return (
103
- "💰 For complete and up-to-date fee details, please visit the official page:\n"
104
- "🔗 https://ue.edu.pk/allfeestructure.php"
105
- )
106
-
107
- # Calculate Similarity
108
- if dataset_embeddings is None:
109
- return "System is currently updating. Please try again in a moment."
110
-
111
- user_embedding = similarity_model.encode(user_input_lower, convert_to_tensor=True)
112
- similarities = util.pytorch_cos_sim(user_embedding, dataset_embeddings)[0]
113
- best_match_idx = similarities.argmax().item()
114
- best_score = similarities[best_match_idx].item()
115
-
116
- # DECISION BRIDGE
117
- if best_score >= 0.65:
118
- # PATH 1: Verified Data Found
119
- original_answer = dataset_answers[best_match_idx]
120
- prompt = (
121
- f"A student asked: '{user_input}'.\n"
122
- f"Based on our official records: '{original_answer}'.\n"
123
- "Please rewrite this into a friendly, professional response."
124
- )
125
- system_role = "You are an official University assistant. Use ONLY the provided records."
126
- else:
127
- # PATH 2: No Match - Fallback to General Knowledge & Log
128
- manage_unmatched_queries(user_input)
129
- prompt = (
130
- f"A student asked: '{user_input}'.\n"
131
- "I don't have a specific record for this. Provide a general helpful response "
132
- "based on university standards. If you are unsure about specific dates or costs, "
133
- "direct them to contact info@ue.edu.pk."
134
- )
135
- system_role = "You are a helpful University assistant. Direct unknown specific queries to official channels."
136
-
137
- # Final Generation
138
- llm_response = query_groq_llm(prompt, system_message=system_role)
139
-
140
- # Final Fallback in case API fails
141
- if not llm_response:
142
- return original_answer if best_score >= 0.65 else "Please contact the university at info@ue.edu.pk for assistance."
143
-
144
- return llm_response