|
|
from collections import defaultdict |
|
|
from json_repair import repair_json |
|
|
from rank_bm25 import BM25Okapi |
|
|
from openai import OpenAI |
|
|
from tqdm import tqdm |
|
|
import numpy as np |
|
|
import unicodedata |
|
|
import tiktoken |
|
|
import faiss |
|
|
import time |
|
|
import json |
|
|
import os |
|
|
import re |
|
|
|
|
|
# Tell the Intel OpenMP runtime to tolerate duplicate copies of itself.
# NOTE(review): presumably needed because faiss and numpy each load their own
# OpenMP runtime -- confirm before removing.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# SECURITY: the API key used to be hard-coded here. That key must be treated
# as leaked and revoked. The key is now read from the environment and must
# never be committed to source control.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# With api_key=None the OpenAI client looks up OPENAI_API_KEY itself and
# raises a clear error when no key is configured.
client = OpenAI(api_key=OPENAI_API_KEY)
|
|
|
|
|
def generate_embeddings(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` computed by the OpenAI embeddings API.

    Args:
        text: The string to embed; sent as a single-element batch.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` field of the first (and only) item in the response.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding
|
|
|
|
|
# Import-time smoke test: the o200k_base encoding must round-trip a sample.
_sample = "hello world"
enc = tiktoken.get_encoding("o200k_base")
assert enc.decode(enc.encode(_sample)) == _sample

# Rebind `enc` to whichever encoding tiktoken associates with gpt-4o.
enc = tiktoken.encoding_for_model("gpt-4o")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load every contract JSON file, in sorted filename order, into `json_list`.
# A contract's position in this list is the `contract_index` used elsewhere.
folder_path = "conversational/Json_contracts"
json_list = []

for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue
    full_path = os.path.join(folder_path, filename)
    with open(full_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    json_list.append(data)

print(f"✅ Loaded {len(json_list)} contracts.")
|
|
|
|
|
def fetch_json(contract_index: int, item_index: int) -> dict | None:
    """Look up one item of one loaded contract.

    Args:
        contract_index: Position of the contract in ``json_list``.
        item_index: Position of the item inside that contract.

    Returns:
        The stored item, or ``None`` when the lookup raises ``IndexError``
        or ``TypeError``.
    """
    try:
        contract_items = json_list[contract_index]
        item = contract_items[item_index]
    except (IndexError, TypeError):
        return None
    return item
|
|
|
|
|
|
|
|
def build_vector_of_faiss_indices_from_folder(folder_path):
    """Build one inner-product FAISS index per ``.npy`` file in a folder.

    Files are processed in sorted name order. Each embedding matrix is cast
    to float32 and L2-normalised in place before being added, so that
    inner-product search on the index ranks by cosine similarity.

    Args:
        folder_path: Directory containing the ``.npy`` embedding matrices.

    Returns:
        A tuple ``(faiss_indices, file_names)`` of parallel lists.
    """
    npy_files = [name for name in sorted(os.listdir(folder_path)) if name.endswith(".npy")]

    faiss_indices = []
    for name in npy_files:
        vectors = np.load(os.path.join(folder_path, name)).astype(np.float32)
        faiss.normalize_L2(vectors)

        flat_index = faiss.IndexFlatIP(vectors.shape[1])
        flat_index.add(vectors)
        faiss_indices.append(flat_index)

    return faiss_indices, npy_files
|
|
|
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Return a canonical, lower-cased, single-spaced version of ``text``.

    Steps, in order: NFKC Unicode normalisation, removal of zero-width and
    bidirectional control characters, conversion of line breaks to spaces,
    collapsing of whitespace runs, lower-casing, and trimming.

    Args:
        text: Raw text; any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The normalised string.
    """
    if not text:
        return ""

    cleaned = unicodedata.normalize("NFKC", text)

    # (pattern, replacement) pairs applied sequentially.
    substitutions = (
        (r'[\u200b-\u200f\u202a-\u202e\u2060-\u206f]', ''),
        (r'[\r\n\u2028\u2029]+', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower().strip()
|
|
|
|
|
def s_stripper(sent):
    """Drop one trailing ``s`` from every word of at least three characters.

    The sentence is split on whitespace and re-joined with single spaces, so
    whitespace runs are collapsed as a side effect.

    Args:
        sent: The sentence to process.

    Returns:
        The processed sentence.
    """
    return ' '.join(
        token[:-1] if len(token) >= 3 and token.endswith('s') else token
        for token in sent.split()
    )
|
|
|
|
|
|
|
|
def tokenize(text):
    """Split ``text`` into lower-cased tokens after ``s_stripper``.

    Args:
        text: The text to tokenize.

    Returns:
        A list of lower-cased whitespace-separated tokens.
    """
    return s_stripper(text).lower().split()
|
|
|
|
|
# One BM25 index per contract, parallel to `json_list`.
BM25_vectors = []

for contract_json in tqdm(json_list, desc="Normalizing texts"):
    # Keep exactly one token list per item, even when its text is empty or
    # missing. Filtering empty items out (as was done before) shifts the BM25
    # row numbers, so the positions returned by a search no longer match the
    # item positions in `json_list` that fetch_json() looks up.
    docs = [normalize_text(item.get("text") or "") for item in contract_json]
    tokenized_docs = [tokenize(doc) for doc in docs]

    bm25_index = BM25Okapi(tokenized_docs)
    BM25_vectors.append(bm25_index)
|
|
|
|
|
def check_json(input_string: str) -> bool:
    """Report whether ``input_string`` contains "json", ignoring case.

    Args:
        input_string: The text to inspect.

    Returns:
        ``True`` when the lower-cased text contains the substring ``json``.
    """
    lowered = input_string.lower()
    return lowered.find("json") != -1
|
|
|
|
|
|
|
|
# Folder holding one pre-computed embedding matrix (.npy) per contract.
embedding_path = "conversational/ada3_embeddings"

# One FAISS index per embedding file, plus the file names they came from.
vector_of_indices, f_names = build_vector_of_faiss_indices_from_folder(
    folder_path=embedding_path
)

# Display names of the contracts; the list position is the contract index.
contract_code_names = [
    "PMC_A_Jacobs",
    "PMC_B_Hill",
    "PMC_C_Louis Berger",
    "DB_Red_Line_North_UG",
    "DB_Gold_Line_UG",
    "DB_Green_Line_UG",
    "DB_Red_Line_South_Elevated",
    "DB_Green_Line_Elevated",
]
|
|
|
|
|
def Get_Context(final_indices: list[dict]) -> str:
    """Render the retrieved pages as one plain-text context block.

    The output starts with a header giving the number of contracts and their
    code names. Each contract then gets a ``#####`` separator, its
    ``contract_name`` (taken from the first retrievable page) and, for every
    page, the file name, path, page number and text.

    Page indices that are negative or that ``fetch_json`` cannot resolve are
    skipped. Such indices occur when neighbouring pages are added around a
    hit at the start or end of a contract; previously they either wrapped
    around to the last pages or raised ``TypeError``.

    Args:
        final_indices: Dicts with a ``contract_index`` and an iterable of
            ``page_indices``.

    Returns:
        The assembled context string.
    """
    contract_names = [contract_code_names[item["contract_index"]] for item in final_indices]

    parts = [f"Number of contracts: {len(final_indices)}\nContract-names: {contract_names}\n"]

    for contract in final_indices:
        i = contract["contract_index"]

        pages = []
        for pos in contract["page_indices"]:
            # A negative index would silently address pages from the end.
            if pos < 0:
                continue
            page = fetch_json(i, pos)
            if page is not None:
                pages.append(page)

        parts.append("\n#####\n")
        if not pages:
            # Nothing retrievable for this contract: keep only the separator.
            continue

        parts.append("contract_name: " + pages[0]["contract_name"] + "\n")
        for page in pages:
            parts.append(
                "file_name: " + page["file_name"] + "\n"
                + "path: " + page["path"] + "\n"
                + "Page Number: " + str(page["page"]) + " " + page["text"] + "\n\n"
            )

    return "".join(parts)
|
|
|
|
|
def Get_Faiss_indices(
    query: str,
    contract_index: list[int],
    vector_of_indices: list[faiss.IndexFlatIP],
    K: int
) -> list[dict]:
    """Return the top-``K`` nearest pages per contract for ``query``.

    The query is embedded, L2-normalised and searched against the FAISS index
    of every requested contract.

    Args:
        query: Text to embed and search for.
        contract_index: Indices of the contracts to search.
        vector_of_indices: One FAISS index per contract.
        K: Maximum number of hits per contract.

    Returns:
        One dict per requested contract with ``contract_index`` and
        ``page_indices`` (a NumPy integer array, best hit first).
    """
    vquery = np.asarray(generate_embeddings(query), dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(vquery)

    json_index = []
    for i in contract_index:
        _, neighbours = vector_of_indices[i].search(vquery, K)
        hits = neighbours[0]
        # FAISS pads the result with -1 when the index holds fewer than K
        # vectors; those placeholders are not page indices.
        json_index.append({"contract_index": i, "page_indices": hits[hits >= 0]})

    return json_index
|
|
|
|
|
def Get_BM25_indices(
    query: str,
    contract_index: list[int],
    bm25_vectors: list,
    K: int
) -> list[dict]:
    """Return the ``K`` best-scoring BM25 pages per contract for ``query``.

    The query goes through the same ``normalize_text`` + ``tokenize`` pipeline
    that is used when the BM25 indices are built. A private tokenizer that
    only lower-cased and split used to be applied here, so query tokens could
    fail to match the s-stripped document tokens.

    Args:
        query: Search text.
        contract_index: Indices of the contracts to search.
        bm25_vectors: One BM25 index per contract.
        K: Maximum number of pages per contract.

    Returns:
        One dict per requested contract with ``contract_index`` and
        ``page_indices`` (a NumPy integer array, best score first).
    """
    tokens = tokenize(normalize_text(query))

    json_index = []
    for i in contract_index:
        scores = bm25_vectors[i].get_scores(tokens)
        # argsort is ascending: reverse it and keep the K highest scores.
        top_indices = np.argsort(scores)[::-1][:K]
        json_index.append({"contract_index": i, "page_indices": top_indices})

    return json_index
|
|
|
|
|
def merge_contracts_extended(obj1, obj2):
    """Merge two retrieval results and add each hit's neighbouring pages.

    For every page index ``p`` in either input, ``p - 1``, ``p`` and ``p + 1``
    are collected per contract. Negative values are discarded: a negative
    input is a "no result" placeholder, and a negative neighbour (``p - 1``
    for ``p == 0``) would wrap around to the end of the contract when used as
    a list index. The upper bound is not known here, so ``p + 1`` may point
    past the last page; consumers must tolerate that.

    Args:
        obj1: List of dicts with ``contract_index`` and ``page_indices``.
        obj2: A second list in the same format.

    Returns:
        One dict per contract, in order of first appearance, whose
        ``page_indices`` is a sorted list of unique plain ``int`` values.
    """
    merged = defaultdict(set)

    for entry in (*obj1, *obj2):
        pages = merged[entry['contract_index']]
        for p in entry['page_indices']:
            p = int(p)  # NumPy integers become plain ints
            if p < 0:
                continue
            pages.update(q for q in (p - 1, p, p + 1) if q >= 0)

    return [{'contract_index': idx, 'page_indices': sorted(pages)} for idx, pages in merged.items()]
|
|
|
|
|
|
|
|
def reciprocal_rank_fusion(bm25_indices, faiss_indices, Top_K=10, k=60):
    """Fuse BM25 and FAISS rankings with reciprocal rank fusion.

    Every ``(contract, page)`` pair receives ``1 / (k + rank)`` from each
    ranking it appears in, where ``rank`` is the zero-based position inside
    that ranking. The contributions are summed and the ``Top_K`` best pages
    of each contract are kept.

    Args:
        bm25_indices: List of dicts with ``contract_index`` and ranked
            ``page_indices``.
        faiss_indices: A second list in the same format.
        Top_K: Maximum number of pages kept per contract.
        k: Smoothing constant of the fusion formula.

    Returns:
        One dict per contract with ``contract_index`` and ``page_indices``
        (an int64 NumPy array, best fused score first).
    """
    fused = defaultdict(float)
    for ranking in (bm25_indices, faiss_indices):
        for entry in ranking:
            cid = entry['contract_index']
            for position, page in enumerate(entry['page_indices']):
                fused[(cid, page)] += 1 / (k + position)

    # Regroup the fused scores by contract, keeping first-seen order.
    per_contract = defaultdict(list)
    for (cid, page), score in fused.items():
        per_contract[cid].append((page, score))

    output = []
    for cid, scored in per_contract.items():
        scored.sort(key=lambda pair: pair[1], reverse=True)
        best_pages = [page for page, _ in scored[:Top_K]]
        output.append({
            'contract_index': cid,
            'page_indices': np.array(best_pages, dtype=np.int64),
        })

    return output
|
|
|
|
|
def chat_gpt_Agentic_RAG(messages):
    """Run one turn of the contract assistant.

    The conversation is first sent to the chat model. While no contracts are
    attached to the request (``messages.contracts`` is falsy) the model acts
    as a contract selector. When its reply contains a usable JSON selection,
    the pages of every selected contract are retrieved (FAISS + BM25, merged
    with neighbouring pages) and one answer per contract is generated from
    that context.

    Fixes over the previous version:
      * ``cxt`` was read before assignment in the follow-up branch
        (``UnboundLocalError``).
      * ``response_agent`` only existed inside a never-called async
        generator, so collecting the answers raised ``NameError``; the
        streamed answer is now consumed directly. That generator also used
        ``asyncio``, which this file does not import.
      * A reply that merely mentions "json" but carries no usable selection
        is returned as-is instead of raising.

    Args:
        messages: Request object exposing ``contracts`` and ``messages``, the
            latter being items with ``role`` and ``content`` attributes.

    Returns:
        A tuple ``(response, json_response)``: the text to show to the user
        and the raw first model reply.
    """
    JSON_FLAG = messages.contracts

    history = [{"role": m.role, "content": m.content} for m in messages.messages]
    original_message = history[0]["content"]
    user_message = history[-1]["content"]

    print("Histppry ", history)
    print("Origina MSG ", original_message)

    # NOTE(review): no retrieved context is available at this point for
    # follow-up turns, so the prompt carries an empty context rather than
    # crashing. Confirm where that context is supposed to come from.
    cxt = ""

    if not JSON_FLAG:
        SYS_PROMPT = SYS_QRAIL_O4_plus
    else:
        SYS_PROMPT = (
            "You are a helpful assistant that answers questions based on the provided context.\n"
            "If you don't have enough information, ask for more details.\n"
            f" context : {cxt}"
        )

    # NOTE(review): `history` already ends with the latest user message, so
    # the model sees it twice (once raw, once with the "Query :" prefix).
    # Kept as-is because prompt behaviour depends on it.
    history_openai_format = [
        {"role": "system", "content": SYS_PROMPT},
        *history,
        {"role": "user", "content": "Query :" + user_message},
    ]

    response = call_gpt(history_openai_format)
    json_response = response

    if JSON_FLAG or not check_json(response):
        return response, json_response

    # check_json() is only a substring test, so validate what was parsed.
    json_result = json.loads(repair_json(response))
    contract_indices = json_result.get("contract_indices") if isinstance(json_result, dict) else None
    if not isinstance(contract_indices, list) or not contract_indices:
        return response, json_response

    key_intent = call_gpt_intent(s_stripper(original_message))

    responses = []
    for contract_idx in contract_indices:
        faiss_indices = Get_Faiss_indices(key_intent, [contract_idx], vector_of_indices, 5)
        BM25_indices = Get_BM25_indices(key_intent, [contract_idx], BM25_vectors, 10)
        final_indices = merge_contracts_extended(BM25_indices, faiss_indices)
        cxt = Get_Context(final_indices)

        # The per-contract answer is a stream of text chunks; join them.
        response_agent = "".join(call_Context_Answer_per_contract(original_message, cxt))
        responses.append(response_agent)

    response = "\n\n".join(responses)
    return response, json_response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def call_gpt(message_text):
    """Send a chat conversation to the model and return the reply text.

    Args:
        message_text: List of chat messages (dicts with ``role`` and
            ``content``) passed to the API unchanged.

    Returns:
        The content of the first choice's message.
    """
    request = {
        "model": "gpt-4.1-mini",
        "messages": message_text,
        "temperature": 0.0,
        "max_tokens": 1000,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    reply = client.chat.completions.create(**request)
    return reply.choices[0].message.content
|
|
|
|
|
def call_gpt_intent(query):
    """Ask the model to reduce ``query`` to its search keywords.

    Args:
        query: The user query to strip of stop words and project-wide terms.

    Returns:
        The content of the first choice's message.
    """
    keyword_prompt = """You are a simple keyword extraction assistant.
Given a query your task is to just strip and remove all the stop words, interrogative words punctuations, and leave the rest
All queries are related to Qatar Rail Project so **stop words** will include also irrelevant and redundant words
such as , UG , Underground , elevated , Gold line , Red line , Green line , Qatar Rail , Qatar Rail Project,
PMC (Project Management Consultant),..such terms will confuse the search and should be removed.

"""

    conversation = [
        {"role": "system", "content": keyword_prompt},
        {"role": "user", "content": query},
    ]

    request = {
        "model": "gpt-4.1-mini",
        "messages": conversation,
        "temperature": 0.0,
        "max_tokens": 200,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    reply = client.chat.completions.create(**request)
    return reply.choices[0].message.content
|
|
|
|
|
def call_Context_Answer(query, context):
    """Answer ``query`` from ``context``, which may cover several contracts.

    Args:
        query: The user's natural-language question.
        context: Retrieved contract text for the model to answer from.

    Returns:
        The content of the first choice's message.
    """
    answer_prompt = """You are “Qatar Rail AI Assistant,” a friendly and smart
assistant that helps users find information in Qatar Rail contracts. You will be prvided with a context and a question
The context will contain information about one or more contracts.
The question will be a natural language question about the context.
Your task is to answer the question using the context provided.
Do not answer the question using your own knowledge.
**Output Format**:
- nicely formatted markdown text
- Use the contract names as headers for the sections of the answer
- Use bullet points to list the information
- Use bold text to highlight important information
- Provide a brief summary of the answer at the end if it's a single contract
- Provide a comparative table if it's multiple contracts
- add references to the files and page numbers in the context where the information was found.
"""

    conversation = [
        {"role": "system", "content": answer_prompt},
        {"role": "user", "content": f"Query {query} \n Context {context}"},
    ]

    request = {
        "model": "gpt-4.1-mini",
        "messages": conversation,
        "temperature": 0.0,
        "max_tokens": 3500,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    reply = client.chat.completions.create(**request)
    return reply.choices[0].message.content
|
|
|
|
|
def call_Context_Answer_per_contract(query, context):
    """Stream an answer to ``query`` from the context of a single contract.

    Args:
        query: The user's natural-language question.
        context: Retrieved text of one contract for the model to answer from.

    Yields:
        The text fragments of the streamed completion, in arrival order;
        chunks without content are skipped.
    """
    answer_prompt = """You are “Qatar Rail AI Assistant,” a friendly and smart
assistant that helps users find information in Qatar Rail contracts. You will be provided with a context and a question about
a single contract.
The question will be a natural language question about the context.
Your task is to answer the question using the context provided.
Do not answer the question using your own knowledge.unless only you were asked to provide a template notice
depending on the query intent.
If no clear answer can be found in the context, mention that the answer is not available.


**Output Format**:
- nicely formatted markdown text
- Use the contract names as headers with Bold for the sections of the answer
- Use bullet points to list the information
- Use bold text to highlight important information
- add references in bullets for , where the information was found in context
-- filenames
-- File Paths
-- page numbers
"""

    conversation = [
        {"role": "system", "content": answer_prompt},
        {"role": "user", "content": f"Query {query} \n Context {context}"},
    ]

    request = {
        "model": "gpt-4o-mini",
        "messages": conversation,
        "temperature": 0.0,
        "max_tokens": 3500,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
        "stream": True,
    }
    stream = client.chat.completions.create(**request)

    for chunk in stream:
        piece = chunk.choices[0].delta.content
        if piece is None:
            continue
        yield piece
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# System prompt for the contract-selection turn: it lists the contracts and
# their indices, and tells the model to reply with a JSON selection once the
# user's intent is clear.
#
# Prompt fixes: catalogue entries 2 and 5 were malformed (missing bullet,
# comma, closing ** and description), the name of entry 2 now matches
# `contract_code_names`, the duplicated "and and" is gone, and the second
# comment in the JSON template is properly closed.
SYS_QRAIL_O4_plus = """You are “Qatar Rail AI Assistant,” a friendly and smart assistant that helps users find information
in Qatar Rail contracts. Use conversational language, ask brief clarifying questions when needed,
and only emit your JSON once you’re sure of the user’s intent.
Background information:
1. Know your universe of contracts: indices, names and their descriptions:
• 0,**PMC_A_Jacobs** – Project management consulting services by Jacobs Consulting
• 1,**PMC_B_Hill** – Project management consulting services by Hill International
• 2,**PMC_C_Louis Berger** – Project management consulting services by Louis Berger Egis Rail JV
• 3,**DB_Red_Line_North_UG** – Design-Build Construction for the Red Line North (underground)
• 4,**DB_Gold_Line_UG** – Design-Build Construction for the Gold Line (underground)
• 5,**DB_Green_Line_UG** – Design-Build Construction for the Green Line (underground)
• 6,**DB_Red_Line_South_Elevated** – Design-Build Construction for the Red Line South (Elevated)
• 7,**DB_Green_Line_Elevated** – Design-Build Construction for the Green Line (Elevated)

**PMC Contracts information**:
PMC contracts define the core legal framework between the client (e.g., a government or transportation authority) and
the appointed project management consultant. These agreements govern how consultants supervise project progress,
ensure quality control, manage risks, and act on behalf of the client during project execution.
They are not directly involved in construction or design, but in ensuring that those activities are executed per plan and standards.
**DB Contracts information**:
The DB contracts form the backbone of metro infrastructure delivery, comprising detailed and voluminous documentation across all project phases
— from planning, design, and tendering, to construction and reporting. They include:
Design requirements and standards
Contractual volumes and conditions
Site investigations and reports
provisional sums
Correspondence during tender and execution
These contracts cover end-to-end execution responsibilities including design, construction, and sometimes commissioning,
reflecting a turnkey model typical in large infrastructure works.

2. At each user turn:
- You should first identify the contract type (PMC or DB) if its a PMC list to the user the 3 PMC contracts and ask
him to choose one of them.
- use the above contracts information to guess the target of the query as either PMC and DB contracts
- provide this guess to the user as a hint by saying "your query seems to be related to {PMC or DB} contracts"
if its a DB contract list to the user the 5 DB contracts and ask him to choose one or more of them.
a. Try to determine if the user means:
– A single contract
– Multiple contracts

b. If you’re confident, respond immediately with **only** the JSON:
```json
{

"contract_names": [ /* one or more identifiers */ ],
"contract_indices": [ /* their index number according to the list */ ]
}
```
c. If you’re not yet sure, ask **one** concise follow-up, using descriptions where helpful. Examples:
– “Just to confirm, are you looking for the project-management service by Jacobs or by Hill?”
– “Do you want details on the Red Line North or Red Line South construction?”
– “Would you like information on all of the DB construction contracts or a specific line?”

3. Once you’ve asked a clarification, wait for the user’s reply. Don’t ask any more questions unless it’s still ambiguous.

4. Keep your language natural and polite. You should feel like a helpful assistant, not a quizmaster.

—
Start now.

"""