Spaces:

GenArabia
/

Qrail-AI

Sleeping

App Files Files Community

Eslam Magdy committed on Jun 5, 2025

Commit

b567285

verified ·

1 Parent(s): 5e65deb

Upload utils.py

Browse files

Files changed (1) hide show

conversational/utils.py +548 -0

conversational/utils.py ADDED Viewed

	@@ -0,0 +1,548 @@

+from collections import defaultdict
+from json_repair import repair_json
+from rank_bm25 import BM25Okapi
+from openai import OpenAI
+from tqdm import tqdm
+import numpy as np
+import unicodedata
+import tiktoken
+import faiss
+import time
+import json
+import os
+import re
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+# <<<<< Client >>>>>
+OPENAI_API_KEY = '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'
+client = OpenAI(api_key=OPENAI_API_KEY)
+def generate_embeddings(text, model="text-embedding-3-small"): # model = "deployment_name"
+    return client.embeddings.create(input = [text], model=model).data[0].embedding
+# Import-time sanity check: the o200k_base encoding must round-trip plain text.
+enc = tiktoken.get_encoding("o200k_base")
+assert enc.decode(enc.encode("hello world")) == "hello world"
+# Rebind ``enc`` to whatever encoding tiktoken maps to the gpt-4o model; this
+# replaces the encoder object created above.
+enc = tiktoken.encoding_for_model("gpt-4o")
+# <<<<< Initials >>>>>
+# Load All Jsons
+folder_path = "conversational/Json_contracts"
+json_list = []
+for filename in sorted(os.listdir(folder_path)):
+    if filename.endswith(".json"):
+        full_path = os.path.join(folder_path, filename)
+        with open(full_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+            json_list.append(data)
+print(f"✅ Loaded {len(json_list)} contracts.")
+def fetch_json(contract_index: int, item_index: int) -> dict | None:
+    try:
+        return json_list[contract_index][item_index]
+    except (IndexError, TypeError):
+        return None
+def build_vector_of_faiss_indices_from_folder(folder_path):
+    faiss_indices = []
+    file_names = []
+    for file in sorted(os.listdir(folder_path)):
+        if file.endswith(".npy"):
+            file_path = os.path.join(folder_path, file)
+            embeddings = np.load(file_path).astype(np.float32)
+            # embeddings = np.load(file_path, allow_pickle=False).astype(np.float32)
+            faiss.normalize_L2(embeddings)
+            dim = embeddings.shape[1]
+            index = faiss.IndexFlatIP(dim)
+            index.add(embeddings)
+            faiss_indices.append(index)
+            file_names.append(file)
+    return faiss_indices, file_names
+def normalize_text(text: str) -> str:
+    if not text:
+        return ""
+    # 1. Unicode normalization (standard form)
+    text = unicodedata.normalize("NFKC", text)
+    # 2. Remove invisible control characters (except tabs)
+    text = re.sub(r'[\u200b-\u200f\u202a-\u202e\u2060-\u206f]', '', text)
+    # 3. Replace line/paragraph breaks and unicode separators with space
+    text = re.sub(r'[\r\n\u2028\u2029]+', ' ', text)
+    # 4. Collapse multiple spaces and tabs
+    text = re.sub(r'\s+', ' ', text)
+    # 5. Lowercase (optional, for BM25 or standard IR)
+    text = text.lower()
+    # 6. normalize to singular
+    # 7. Strip leading/trailing space
+    return text.strip()
+def s_stripper(sent):
+    words = sent.split()
+    processed = []
+    for word in words:
+        if len(word) >= 3 and word.endswith('s'):
+            processed.append(word[:-1])
+        else:
+            processed.append(word)
+    return ' '.join(processed)
+def tokenize(text):
+    text=s_stripper(text)
+    return text.lower().split()
+BM25_vectors = []
+for contract_json in tqdm(json_list, desc="Normalizing texts"):
+    docs = [normalize_text(item["text"]) for item in contract_json if item.get("text", "").strip()]
+    tokenized_docs = [tokenize(doc) for doc in docs]
+    bm25_index = BM25Okapi(tokenized_docs)
+    BM25_vectors.append(bm25_index)
+def check_json(input_string: str) -> bool:
+    return "json" in input_string.lower()
+# Folder of pre-computed embedding matrices (one .npy file per contract).
+embedding_path="conversational/ada3_embeddings"
+# One FAISS index per .npy file, plus the file names in the same (sorted) order.
+vector_of_indices,f_names = build_vector_of_faiss_indices_from_folder(embedding_path)
+# Display name of each contract; a name's list position is the contract index
+# used in the retrieval result dicts ("contract_index").
+# NOTE(review): presumably this order also matches the sorted file order of the
+# JSON and .npy folders -- verify against the data.
+contract_code_names = [
+    "PMC_A_Jacobs",                # 0
+    "PMC_B_Hill",                  # 1
+    "PMC_C_Louis Berger", # 2
+    "DB_Red_Line_North_UG",        # 3
+    "DB_Gold_Line_UG",             # 4
+    "DB_Green_Line_UG",            # 5
+    "DB_Red_Line_South_Elevated",  # 6
+    "DB_Green_Line_Elevated"       # 7
+]
+def Get_Context(final_indices: list[dict]) -> str:
+    contract_names = [contract_code_names[item["contract_index"]] for item in final_indices]
+    cxt = f"Number of contracts: {len(final_indices)}\nContract-names: {contract_names}\n"
+    for contract in final_indices:
+        i = contract["contract_index"]
+        page_indices = contract["page_indices"]
+        cxt += "\n#####\n"
+        meta_data = fetch_json(i, page_indices[0])  # Use the first page to get contract metadata
+        cxt += "contract_name: " + meta_data["contract_name"] + "\n"
+        for pos in page_indices:
+            page = fetch_json(i, pos)
+            cxt += (
+                "file_name: " + page["file_name"] + "\n" +
+                "path: " + page["path"] + "\n" +
+                "Page Number: " + str(page["page"]) + "  " + page["text"] + "\n\n"
+            )
+    return cxt
+def Get_Faiss_indices(
+    query: str,
+    contract_index: list[int],
+    vector_of_indices: list[faiss.IndexFlatIP],
+    K: int
+) -> list[dict]:
+    vquery = np.array(generate_embeddings(query)).reshape(1, -1).astype('float32')
+    faiss.normalize_L2(vquery)
+    json_index = []
+    for i in contract_index:
+        index = vector_of_indices[i]
+        D, I = index.search(vquery, K)
+        json_index.append({"contract_index":i, "page_indices": I[0]})
+    return json_index
+def Get_BM25_indices(
+    query: str,
+    contract_index: list[int],
+    bm25_vectors: list,
+    K: int
+) -> list[dict]:
+    def tokenize(text):
+        return text.lower().split()
+    tokens = tokenize(query)
+    json_index=[]
+    for i in contract_index:
+        bm25 = bm25_vectors[i]
+        json_data = json_list[i]
+        scores = bm25.get_scores(tokens)
+        top_indices = np.argsort(scores)[::-1][:K]
+        json_index.append({"contract_index":i, "page_indices": top_indices})
+    return json_index
+def merge_contracts_extended(obj1, obj2):
+    merged = defaultdict(set)
+    def expand_indices(indices):
+        # For each page, include page-1, page, page+1
+        expanded = set()
+        for p in indices:
+            expanded.update([p - 1, p, p + 1])
+        return expanded
+    # Add pages from obj1
+    for entry in obj1:
+        idx = entry['contract_index']
+        merged[idx].update(expand_indices(entry['page_indices']))
+    # Add pages from obj2
+    for entry in obj2:
+        idx = entry['contract_index']
+        merged[idx].update(expand_indices(entry['page_indices']))
+    # Convert sets to sorted lists
+    return [{'contract_index': idx, 'page_indices': sorted(pages)} for idx, pages in merged.items()]
+def reciprocal_rank_fusion(bm25_indices, faiss_indices, Top_K=10, k=60):
+    rrf_scores = defaultdict(float)
+    def add_scores(source):
+        for contract in source:
+            contract_index = contract['contract_index']
+            pages = contract['page_indices']
+            for rank, page_index in enumerate(pages):
+                key = (contract_index, page_index)
+                rrf_scores[key] += 1 / (k + rank)
+    add_scores(bm25_indices)
+    add_scores(faiss_indices)
+    contract_pages = defaultdict(list)
+    for (contract_index, page_index), score in rrf_scores.items():
+        contract_pages[contract_index].append((page_index, score))
+    output = []
+    for contract_index, pages in contract_pages.items():
+        sorted_pages = sorted(pages, key=lambda x: x[1], reverse=True)[:Top_K]
+        page_indices = np.array([p[0] for p in sorted_pages], dtype=np.int64)
+        output.append({'contract_index': contract_index, 'page_indices': page_indices})
+    return output
+def chat_gpt_Agentic_RAG(messages):
+    JSON_FLAG = messages.contracts
+    history = [{"role": m.role, "content": m.content} for m in messages.messages]
+    original_message= history[0]['content']
+    user_message = history[-1]["content"]
+    print("Histppry ", history)
+    print("Origina MSG ", original_message)
+    if not JSON_FLAG:
+        SYS_PROMPT = SYS_QRAIL_O4_plus
+    else:
+        SYS_PROMPT = f"""You are a helpful assistant that answers questions based on the provided context.
+        If you don't have enough information, ask for more details.\n context : {cxt}"""
+    history_openai_format = []
+    history_openai_format.append({"role": "system", "content": SYS_PROMPT})
+    history_openai_format.extend(history)
+    history_openai_format.append({"role": "user", "content": "Query :" + user_message})
+    response = call_gpt(history_openai_format)
+    json_response = response
+    if check_json(response) and not JSON_FLAG:
+        json_result=repair_json(response)
+        json_result=json.loads(json_result)
+        key_intent=call_gpt_intent(s_stripper(original_message))
+        n_contracts=len(json_result["contract_names"])
+        responses = []
+        for nc in range(n_contracts):
+            faiss_indices=Get_Faiss_indices(key_intent,[json_result["contract_indices"][nc]],vector_of_indices,5)
+            BM25_indices=Get_BM25_indices(key_intent,[json_result["contract_indices"][nc]],BM25_vectors,10)
+            final_indices = merge_contracts_extended(BM25_indices,faiss_indices)
+            cxt=Get_Context(final_indices)
+            # Total_tokens=count_tokens(cxt)
+            # response_agent = call_Context_Answer_per_contract(original_message, cxt)
+            async def event_stream():
+                response_agent = ""
+                for chunk in call_Context_Answer_per_contract(original_message, cxt):
+                    await asyncio.sleep(0.08)
+                    response_agent += chunk
+                    yield json.dumps({"type": "stream", "data": {"ai_message":  response_agent   }}) + "\n"
+                responses.append(response_agent)
+        response = "\n\n".join(responses)
+    return response, json_response
+# <<<<< GPTs >>>>>
+def call_gpt(message_text):
+    completion = client.chat.completions.create(
+        model="gpt-4.1-mini",
+        # model="gpt-4o",
+        messages=message_text,
+        temperature=0.0,
+        max_tokens=1000,
+        top_p=0.95,
+        frequency_penalty=0,
+        presence_penalty=0,
+        stop=None,
+    )
+    return completion.choices[0].message.content
+def call_gpt_intent(query):
+    SYS_Parse = """You are a simple keyword extraction assistant.
+    Given a query your task is to just strip and remove all the stop words, interrogative words punctuations, and leave the rest
+    All queries are related to Qatar Rail Project so **stop words** will include also irrelevant and redundant words
+    such as , UG , Underground , elevated , Gold line , Red line , Green line , Qatar Rail , Qatar Rail Project,
+    PMC (Project Management Consultant),..such terms will confuse the search and should be removed.
+    """
+    message_text=[
+      {
+        "role": "system",
+        "content": SYS_Parse
+    },
+      {
+        "role": "user",
+        "content": query
+      },
+    ]
+    completion = client.chat.completions.create(
+    model="gpt-4.1-mini",
+    messages = message_text,
+    temperature=0.0,
+    max_tokens=200,
+    top_p=0.95,
+    frequency_penalty=0,
+    presence_penalty=0,
+    stop=None
+    )
+    return completion.choices[0].message.content
+def call_Context_Answer(query, context):
+    SYS_CONTRACT_SEL="""You are “Qatar Rail AI Assistant,” a friendly and smart
+  assistant that helps users find information in Qatar Rail contracts. You will be prvided with a context and a question
+  The context will contain information about one or more contracts.
+  The question will be a natural language question about the context.
+  Your task is to answer the question using the context provided.
+  Do not answer the question using your own knowledge.
+  **Output Format**:
+  - nicely formatted markdown text
+  - Use the contract names as headers for the sections of the answer
+  - Use bullet points to list the information
+  - Use bold text to highlight important information
+  - Provide a brief summary of the answer at the end if it's a single contract
+  - Provide a comparative table if it's multiple contracts
+  - add references to the files and page numbers in the context where the information was found.
+  """
+    message_text=[
+      {
+        "role": "system",
+        "content": SYS_CONTRACT_SEL
+    },
+      {
+        "role": "user",
+        "content": f"Query {query} \n Context {context}"
+      },
+    ]
+    completion = client.chat.completions.create(
+    model="gpt-4.1-mini",
+    messages = message_text,
+    temperature=0.0,
+    max_tokens=3500,
+    top_p=0.95,
+    frequency_penalty=0,
+    presence_penalty=0,
+    stop=None
+    )
+    return completion.choices[0].message.content
+def call_Context_Answer_per_contract(query, context):
+    SYS_CONTRACT_SEL="""You are “Qatar Rail AI Assistant,” a friendly and smart
+  assistant that helps users find information in Qatar Rail contracts. You will be provided with a context and a question about
+  a single contract.
+  The question will be a natural language question about the context.
+  Your task is to answer the question using the context provided.
+  Do not answer the question using your own knowledge.unless only you were asked to provide a template notice
+  depending on the query intent.
+  If no clear answer can be found in the context, mention that the answer is not available.
+  **Output Format**:
+  - nicely formatted markdown text
+  - Use the contract names as headers with Bold for the sections of the answer
+  - Use bullet points to list the information
+  - Use bold text to highlight important information
+  - add references in bullets for , where  the information was found in context
+  -- filenames
+  -- File Paths
+  -- page numbers
+  """
+    message_text=[
+      {
+        "role": "system",
+        "content": SYS_CONTRACT_SEL
+    },
+      {
+        "role": "user",
+        "content": f"Query {query} \n Context {context}"
+      },
+    ]
+    completion = client.chat.completions.create(
+    model="gpt-4o-mini",
+    messages = message_text,
+    temperature=0.0,
+    max_tokens=3500,
+    top_p=0.95,
+    frequency_penalty=0,
+    presence_penalty=0,
+    stop=None,
+    stream=True
+    )
+    for chunk in completion:
+        delta = chunk.choices[0].delta
+        if delta.content is not None:
+            yield delta.content
+    # return completion.choices[0].message.content
+# <<<<< SYS_PROMPT >>>>>
+# System prompt for the contract-selection turn (used by chat_gpt_Agentic_RAG
+# when the request's ``contracts`` flag is falsy). It lists the eight contracts
+# with their indices and tells the model to either ask one clarifying question
+# or reply with a JSON object holding "contract_names" and "contract_indices".
+# NOTE(review): the text below is sent to the model verbatim. Entries 2 and 5
+# lack the bullet used by the other entries, entry 2 has an unclosed "**" and
+# a name that differs from contract_code_names[2], and the second JSON comment
+# is closed with "/" instead of "*/" -- confirm whether these are intentional.
+SYS_QRAIL_O4_plus="""You are “Qatar Rail AI Assistant,” a friendly and smart assistant that helps users find information
+in Qatar Rail contracts. Use conversational language, ask brief clarifying questions when needed,
+and only emit your JSON once you’re sure of the user’s intent.
+Background information:
+1. Know your universe of contracts: indices, names and and their descriptions:
+   • 0,**PMC_A_Jacobs** – Project management consulting services by Jacobs Consulting
+   • 1,**PMC_B_Hill**   – Project management consulting services by Hill International
+     2 **PMC_C_Louis Berger Egis Rail JV
+   • 3,**DB_Red_Line_North_UG**      – Design-Build Construction for the Red Line North (underground)
+   • 4,**DB_Gold_Line_UG**           – Design-Build Construction for the Gold Line  (underground)
+     5, **DB_Green_Line_UG**           – Design-Build Construction for the Green Line (underground)
+   • 6,**DB_Red_Line_South_Elevated**      – Design-Build Construction for the Red Line South (Elevated)
+   • 7,**DB_Green_Line_Elevated** – Design-Build Construction for the Green Line (Elevated)
+   **PMC Contracts information**:
+   PMC contracts define the core legal framework between the client (e.g., a government or transportation authority) and
+   the appointed project management consultant. These agreements govern how consultants supervise project progress,
+   ensure quality control, manage risks, and act on behalf of the client during project execution.
+    They are not directly involved in construction or design, but in ensuring that those activities are executed per plan and standards.
+   **DB Contracts information**:
+    The DB contracts form the backbone of metro infrastructure delivery, comprising detailed and voluminous documentation across all project phases
+    — from planning, design, and tendering, to construction and reporting. They include:
+    Design requirements and standards
+    Contractual volumes and conditions
+    Site investigations and reports
+    provisional sums
+    Correspondence during tender and execution
+    These contracts cover end-to-end execution responsibilities including design, construction, and sometimes commissioning,
+    reflecting a turnkey model typical in large infrastructure works.
+2. At each user turn:
+   - You should first identify the contract type (PMC or DB) if its a PMC list to the user the 3 PMC contracts and ask
+     him to choose one of them.
+     - use the above contracts information to guess the target of the query as either PMC and DB contracts
+     - provide this guess to the user as a hint by saying "your query seems to be related to {PMC or DB} contracts"
+     if its a DB contract list to the user the 5 DB contracts and ask him to choose one or more of them.
+   a. Try to determine if the user means:
+      – A single contract
+      – Multiple contracts
+   b. If you’re confident, respond immediately with **only** the JSON:
+      ```json
+      {
+        "contract_names": [ /* one or more identifiers */ ],
+        "contract_indices": [ /* their index number according to the list / ]
+      }
+      ```
+   c. If you’re not yet sure, ask **one** concise follow-up, using descriptions where helpful. Examples:
+      – “Just to confirm, are you looking for the project-management service by Jacobs or by Hill?”
+      – “Do you want details on the Red Line North or Red Line South construction?”
+      – “Would you like information on all of the DB construction contracts or a specific line?”
+3. Once you’ve asked a clarification, wait for the user’s reply. Don’t ask any more questions unless it’s still ambiguous.
+4. Keep your language natural and polite. You should feel like a helpful assistant, not a quizmaster.
+—
+Start now.
+"""