Spaces:

UAEBot
/

LlamaIndex_Model

Paused

App Files Files Community

Eslam Magdy committed on Apr 20, 2024

Commit

362b592

verified ·

1 Parent(s): 8c81779

Create app.py

Browse files

Files changed (1) hide show

app.py +712 -0

app.py ADDED Viewed

	@@ -0,0 +1,712 @@

+from llama_index.core.response.notebook_utils import display_source_node
+from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
+from llama_index.core.query_engine import RetrieverQueryEngine
+from llama_index.core import VectorStoreIndex, ServiceContext
+from llama_index.core.node_parser import SimpleNodeParser
+from llama_index.llms.azure_openai import AzureOpenAI
+from llama_index.readers.file import PDFReader
+from llama_index.core.schema import IndexNode
+from llama_index.core import Document
+from langchain_core.messages import HumanMessage
+from langchain_openai import AzureChatOpenAI
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.chains import ConversationChain
+from langchain.memory import ConversationBufferWindowMemory
+from langchain.prompts import PromptTemplate
+from sentence_transformers import util
+from openai import AzureOpenAI
+from bs4 import BeautifulSoup
+import pyshorteners
+import gradio as gr
+import pandas as pd
+import numpy as np
+import warnings
+import pickle
+import string
+import json
+import time
+import ast
+import os
+import re
# Azure OpenAI client used for embeddings and chat completions.
# SECURITY: the API key was previously hard-coded in this file. A key that has
# been committed must be treated as leaked and rotated; it is now read from the
# AZURE_OPENAI_API_KEY environment variable (e.g. a Space secret), and start-up
# fails with a KeyError if the variable is not set.
client = AzureOpenAI(
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT", "https://moj-ada3.openai.azure.com/"
    ),
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-15-preview",
)
# Legislation table; the code in this file reads its 'Id', 'HTML', 'Subjects',
# 'Links' and 'Topic' columns.
df = pd.read_csv("/content/drive/MyDrive/MOJ/Legislations/Data.csv")
# Silences every warning for the rest of the process.
warnings.filterwarnings("ignore")
def extract_title(text):
    """Return the part of ``text`` after its last dash, stripped.

    A hyphen ('-') takes precedence over an en dash ('–'): the en dash is
    only considered when the text contains no hyphen at all. Returns an empty
    string when neither character is present.
    """
    for dash in ('-', '–'):
        if dash in text:
            # rpartition yields (head, sep, tail); tail is the last segment.
            return text.rpartition(dash)[2].strip()
    return ""
def remove_title(text):
    """Return the part of ``text`` before its first dash, stripped.

    A hyphen ('-') takes precedence over an en dash ('–'). When neither is
    present the text is returned unchanged (and unstripped).
    """
    for dash in ('-', '–'):
        if dash in text:
            # partition yields (head, sep, tail); head is the first segment.
            return text.partition(dash)[0].strip()
    return text
def get_articles(i):
    """Split the HTML of legislation row ``i`` into one row per article.

    The ``<div>`` elements of ``df['HTML'][i]`` are walked in document order.
    Text is accumulated until a div whose first CSS class contains the article
    heading class is met; the accumulated text is then stored under the
    current header ('Desc' for the text before the first heading, afterwards
    the entries of the list parsed from ``df['Subjects'][i]``).

    Footnote divs (``id`` starting with 'fn') whose first 35 characters name
    an article and contain "القديم" are stored in that article's 'Comment'.
    If the first footnote's text leaked into the last article, the article
    text is cut at that point.

    Args:
        i: index label of the row in the module-level ``df``.

    Returns:
        A DataFrame with columns 'Header', 'Text' and 'Comment', or ``None``
        if the row could not be parsed (the failure is logged).
    """
    import logging  # local import so this block does not depend on file-level edits

    header_class = 'x__1575___1604___1605___1575___1583___1577_14'

    def _flatten(raw):
        # Collapse line breaks exactly as the original text clean-up did.
        return raw.replace('\n\n', '  ').replace('\n', ' ')

    def _normalise(value):
        # Treat both spellings of "الأولى" as equal when matching footnotes.
        return value.replace("الأولى", "الاولى")

    try:
        soup = BeautifulSoup(df['HTML'][i], 'html.parser')
        headers = ast.literal_eval(df['Subjects'][i])

        rows = []
        seen_headings = 0
        txt = ''
        for d in soup.find_all('div'):
            try:
                # A div without a class is represented by its first child div.
                node = d if d.get('class') is not None else d.find('div')
                d_class = node.get('class')[0]
                d_text = _flatten(node.text)
                if header_class not in d_class:
                    txt += " " + d_text
                    continue
                label = 'Desc' if seen_headings == 0 else headers[seen_headings - 1]
                rows.append((label, txt))
                txt = ''
                seen_headings += 1
            except Exception:
                # Best effort: a malformed div (no child, no class, or more
                # headings than subjects) is skipped and parsing continues.
                continue
        # Whatever text is left belongs to the last heading seen.
        rows.append((headers[seen_headings - 1], txt))

        result_df = pd.DataFrame(rows, columns=['Header', 'Text'])
        # Object dtype so footnote strings can be stored without dtype clashes.
        result_df['Comment'] = pd.Series(np.nan, index=result_df.index, dtype=object)

        footnote_divs = soup.find_all('div', id=lambda v: v and v.startswith('fn'))
        # Normalised once here instead of once per article row.
        footnotes = [_flatten(fn.text) for fn in footnote_divs]

        for r, header in enumerate(result_df['Header']):
            article = _normalise(header.split('-')[0].strip())
            for note in footnotes:
                lead = note[:35]
                if "القديم" in lead and article in _normalise(lead):
                    # .at writes into the frame itself; chained indexing
                    # (frame[col][row] = ...) may write to a temporary copy.
                    result_df.at[r, 'Comment'] = note

        if footnotes:
            first_note = footnotes[0]
            last_row = len(result_df) - 1
            last_text = result_df.at[last_row, 'Text']
            # An empty footnote would make str.split raise ValueError.
            if first_note and first_note in last_text:
                result_df.at[last_row, 'Text'] = last_text.split(first_note)[0]

        return result_df.reset_index(drop=True)
    except Exception:
        logging.getLogger(__name__).exception(
            "get_articles: could not parse legislation row %r", i
        )
        return None
# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# Only load index files produced by a trusted pipeline.
with open('/content/drive/MyDrive/MOJ/Legislations/BaseIndex/ada_base_index_small.pkl', 'rb') as f:
    base_index_ = pickle.load(f)

# Azure OpenAI connection settings.
# SECURITY: the API key was previously hard-coded here (twice). A committed key
# must be treated as leaked and rotated; it is now taken from the
# AZURE_OPENAI_API_KEY environment variable and start-up fails with a KeyError
# when the variable is missing.
azure_endpoint = os.environ.get(
    "AZURE_OPENAI_ENDPOINT", "https://moj-ada3.openai.azure.com/"
)
api_key = os.environ["AZURE_OPENAI_API_KEY"]
api_version = "2024-02-15-preview"
deployment = "gpt-35-turbo-16k"
# Exported so that AzureChatOpenAI below, which is given no explicit endpoint
# or key, can pick them up from the environment.
os.environ["AZURE_OPENAI_API_KEY"] = api_key
os.environ["AZURE_OPENAI_ENDPOINT"] = azure_endpoint
llm_chain = AzureChatOpenAI(
    openai_api_version=api_version,
    azure_deployment=deployment,
)
# Client used for embeddings and streamed chat completions; built from the
# settings above instead of a second copy of the literals.
client = AzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_key=api_key,
    api_version=api_version,
)
# System prompt for the intent-detection phase. The single "{}" placeholder is
# filled via SYS_TEMPLATE.format(info) in slow_echo with the retrieved nodes.
# NOTE(review): the text contains spelling mistakes ("presise", "asiking",
# "dosent", "spesific", "didgits"); it is runtime model input, so it is left
# untouched here — fix deliberately and re-test the model's behaviour.
SYS_TEMPLATE = """
The following is a friendly conversation between a human and an AI.
AI must follow the Instructions below
Instructions:
- AI is an Arabic legal expert in the UAE.
- AI shall always reply in Arabic.
- AI shall never reply in English.
- AI shall not repeat any questions or rephrase them.
- AI shall ask a presise question if needed to determine the user's intent.
- AI shall only ask a maximum of one question if needed to human and then determine his intent.
- AI shall only reply to questions related to law subjects.
- AI shall not answer or explain or give any advice to user questions.
- AI MUST not provide any details ever from given information, only use it to determine the desired intent.
- AI shall use the given information only to ask precise and short question to determine user intent.
- AI shall determine the user desired intent with the minimum number of questions possible.
- AI shall not ask the user again after the user confirms on any question.
- AI shall decide user intent if the user's query contains enough details without asiking him any more questions.
- AI shall decide which suits query better if user wants a general info or says give me anything.
- AI's only purpose is to determine the intended topic from the user.
- AI shall choose node with the best description matching with the human's intent.
- AI shall always end the conversation with the returns below as long as the user question matches with given info.
- if AI asks a question and human says he dosent know the spesific law or article then AI shall determine and end the conversation with the returns below.
- if Human asks a question (Is it permissible (هل يجوز)) AI should find the best node that can answer the question with yes or no.
- AI shall end the conversation when the user confirms his intent and return as mentioned below from node's metadata.
- AI shall mention every detail the user wants in the userintent returns.
- AI MUST include the five digits number in the returns.
- AI shall never leave the ID in returns empty it should always be five digits.
Returns:
[
ID: five didgits number ,
Topic: ,
userIntent :
]
Information:
{}
"""
# System prompt for the answering phase. The single "{}" placeholder is filled
# via sys_prompt_intent.format(related_txt) in slow_echo with the article text.
sys_prompt_intent = """
The following is a friendly conversation between a human and an AI.
AI must follow the Instructions below
Instructions:
- AI is an Arabic legal expert in the UAE.
- AI shall always reply in Arabic.
- AI shall never reply in English.
- AI shall answer the human questions based on the content provided.
- AI shall answer only from within the  Content  provided , and NOT from outside.
- AI shall answer using the exact text in content and not improvise.
- AI shall NOT improvise , or give any  advices nor explanation.
- AI shall not provide any links to user and tell him to search in it, it should always provide the required info.
- AI shall always answer to the user query in a professional and informative way inculding all the details.
- ِAI shall answer every question asked in the conversation from human in a detailed way.
- AI shall include in the answer the article number (رقم المادة)
Content:
{}
"""
# ASCII punctuation characters; slow_echo strips them from the extracted intent.
punctuations = string.punctuation
def generate_embeddings(text, model="ada3_small"):
    """Return the embedding of ``text`` from the module-level ``client``.

    Args:
        text: the string to embed; it is sent as a one-element batch.
        model: name passed as ``model=`` to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding
# Retriever over the pickled base index, configured with similarity_top_k=10;
# query_df uses it for the first, unfiltered retrieval.
base_retriever = base_index_.as_retriever(similarity_top_k=10)
def query_df(query):
    """Retrieve candidate nodes for ``query`` and describe them for the prompt.

    For every node returned by ``base_retriever`` the matching article text is
    looked up through ``get_articles``. Texts longer than 800 characters are
    reduced to the chunk most similar to the query (``related_text``).

    Nodes whose law ID or article header cannot be resolved are skipped
    instead of aborting the whole query.

    Args:
        query: the user's question.

    Returns:
        One line per usable node, in the form
        ``Node Number <n> : <text> -- Node MetaData : <dict>``, concatenated
        into a single string (empty when nothing could be resolved).
    """
    max_chars = 800
    node_lines = []
    for i, r in enumerate(base_retriever.retrieve(query)):
        node_meta = r.metadata
        try:
            article_index = df[df['Id'] == int(node_meta['ID'])].index[0]
            article_df = get_articles(article_index)
            article_intended = article_df[article_df['Header'] == node_meta['Article']].reset_index()
            article_text = article_intended['Text'][0]
        except (IndexError, KeyError, TypeError, ValueError):
            # Unknown ID, get_articles returned None, or no header matched:
            # drop this node but keep the numbering of the remaining ones.
            continue

        if len(article_text) > max_chars:
            related_txt = related_text(article_text, query, max_chars)[0]
        else:
            related_txt = article_text

        meta = {
            'Description': node_meta['Description'],
            'ID': node_meta['ID'],
        }
        node_lines.append(f"Node Number {i+1} : {related_txt} -- Node MetaData : {meta}\n")
    return ''.join(node_lines)
+from llama_index.core.vector_stores.types import ExactMatchFilter, MetadataFilters
def query_df_filtered(query,id):
    """Retrieve the articles of one law that best match ``query``.

    The base index is queried with an exact-match metadata filter on ``ID``
    and ``similarity_top_k=3``. Article texts longer than 5000 characters are
    reduced to the chunk most similar to the query (``related_text``).

    Nodes whose law ID or article header cannot be resolved are skipped
    instead of aborting the whole query.

    Args:
        query: the question or extracted user intent.
        id: law identifier; converted with ``str`` for the metadata filter.
            (The name shadows the builtin but is kept for compatibility.)

    Returns:
        One ``Article {'Header': ...} : <text>`` line per usable node,
        concatenated into a single string (empty when nothing matched).
    """
    max_chars = 5000
    filters = MetadataFilters(filters=[
        ExactMatchFilter(key="ID", value=str(id)),
    ])
    b_retriever = base_index_.as_retriever(similarity_top_k=3, filters=filters)

    article_lines = []
    for r in b_retriever.retrieve(query):
        node_meta = r.metadata
        try:
            article_index = df[df['Id'] == int(node_meta['ID'])].index[0]
            article_df = get_articles(article_index)
            article_intended = article_df[article_df['Header'] == node_meta['Article']].reset_index()
            article_text = article_intended['Text'][0]
        except (IndexError, KeyError, TypeError, ValueError):
            # Unknown ID, get_articles returned None, or no header matched.
            continue

        if len(article_text) > max_chars:
            related_txt = related_text(article_text, query, max_chars)[0]
        else:
            related_txt = article_text

        meta = {
            'Header' : node_meta['Article']
        }
        article_lines.append(f"Article {meta} : {related_txt} \n")
    return ''.join(article_lines)
def related_text(txt, q, size):
    """Return the chunk of ``txt`` that is most similar to the query ``q``.

    ``txt`` is split on spaces into chunks of at most ``size`` characters with
    an overlap of 50; every chunk and the query are embedded with
    ``generate_embeddings`` and ranked by cosine similarity.

    Args:
        txt: the article text to search in.
        q: the query to compare against.
        size: maximum chunk length in characters.

    Returns:
        A one-element list holding the best chunk (callers index ``[0]``).
        If the splitter yields no chunk the original text is returned, and a
        single chunk is returned directly without any embedding request.
    """
    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=size,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = text_splitter.split_text(txt)
    if not chunks:
        # Nothing to rank; keep the "non-empty list" contract for callers.
        return [txt]
    if len(chunks) == 1:
        # Ranking one candidate is pointless; skip the embedding calls.
        return chunks

    embeddings = [generate_embeddings(chunk) for chunk in chunks]
    query_embedding = generate_embeddings(q)
    similarity_scores = util.cos_sim(query_embedding, embeddings)
    # Scores have one row (the query); sort descending and take the top index.
    sorted_indices = np.argsort(-similarity_scores)
    best = int(sorted_indices[0][0])
    return [chunks[best]]
def format_messages(message_list):
    """Render chat messages as ``Human : ...`` / ``AI : ...`` lines.

    A message is classified by looking for 'HumanMessage' or 'AIMessage' in
    the string form of its type; anything else is ignored. Consecutive
    messages from the same speaker are merged onto one line, separated by a
    single space.

    Returns:
        The lines joined with newlines ('' for no recognised messages).
    """
    lines = []
    last_speaker = None
    for message in message_list:
        type_name = str(type(message))
        if 'HumanMessage' in type_name:
            speaker = 'Human'
        elif 'AIMessage' in type_name:
            speaker = 'AI'
        else:
            # Unknown message kinds neither print nor break a speaker's run.
            continue

        if speaker == last_speaker:
            lines[-1] = f'{lines[-1]} {message.content}'
        else:
            last_speaker = speaker
            lines.append(f'{speaker} : {message.content}')
    return '\n'.join(lines)
def memory_prompt():
    """Build the "Current conversation" prompt from the conversation memory.

    Only the eight most recent messages of ``memory.chat_memory`` are
    included. The previous code sliced ``messages[8:]``, which dropped the
    first eight messages rather than keeping the last eight, so the ninth
    message was sent with no preceding context.

    Returns:
        The prompt text: a "Current conversation:" header followed by the
        output of ``format_messages``.
    """
    # A negative slice also covers the "eight or fewer" case.
    recent_messages = memory.chat_memory.messages[-8:]
    chat_history_lines = format_messages(recent_messages)
    prompt = f"""
Current conversation:
{chat_history_lines}
  """
    return prompt
def update_prompt(human, ai):
    """Store one exchange in the conversation memory and rebuild the prompt.

    Args:
        human: text saved as the turn's ``input``.
        ai: text saved as the turn's ``output``.

    Returns:
        The prompt produced by ``memory_prompt`` after the turn was saved.
    """
    turn_input = {"input": human}
    turn_output = {"output": ai}
    memory.save_context(turn_input, turn_output)
    return memory_prompt()
# URL shortener; slow_echo uses it for the law link appended to an answer.
shortener = pyshorteners.Shortener()
# NOTE(review): this calls the TinyURL shortener while the module is loading,
# and `short_url` is not referenced anywhere else in the visible code —
# confirm whether the call is still needed.
short_url = shortener.tinyurl.short(df['Links'][0])
# Name passed as `model=` to the chat-completions calls in slow_echo.
mod ="gpt-35-turbo-16k"
# Conversation memory; update_prompt saves every turn into it.
memory = ConversationBufferWindowMemory()
# --- Module-level conversation state ---------------------------------------
# Mutated by slow_echo and reset by test_function.
# NOTE(review): being module globals, these are not kept per user — verify
# how the app is expected to behave with concurrent sessions.
x=0  # 0 until slow_echo has run the first query_df retrieval
info = ''  # retrieval summary returned by query_df
history = ''
is_locked = False  # set to True by slow_echo once an answer for a law was given
is_found = False  # True when the model's reply contained "id"
new_session = False
is_new = False
captured_ID = ''  # five-digit number extracted from the model's reply
user_intent_text = ''  # intent text extracted from the model's reply
full_ans = ''  # text of the most recent model reply
# Conversation transcript prefix sent as the user message; replaced after
# every turn with the value returned by update_prompt.
prompt = f"""
Current conversation:
"""
def clean_ans (answer):
    """Strip one leading speaker label from ``answer``.

    The labels "Assistant:", "AI:" and "AI :" are tried in that order and at
    most one is removed; any whitespace after the label is kept.
    """
    for label in ("Assistant:", "AI:", "AI :"):
        if answer.startswith(label):
            return answer[len(label):]
    return answer
def user(user_message, history):
    """Queue the submitted message as a new chat turn awaiting a reply.

    Returns:
        A pair: an empty string (the new textbox value) and a new history
        list with ``[user_message, None]`` appended; the input list is not
        modified.
    """
    pending_turn = [user_message, None]
    updated_history = history + [pending_turn]
    return "", updated_history
def _type_out(history, text, delay):
    """Append ``text`` to the last bot reply one character at a time."""
    for ch in text:
        time.sleep(delay)
        history[-1][1] += ch
        yield history


def _parse_intent_reply(answer):
    """Extract ``(law_id, intent)`` from the model's intent reply.

    Returns ``None`` when the reply holds no standalone five-digit number.
    The intent is the text after ``user:``/``userIntent:`` with ASCII
    punctuation removed; it is '' when no such label is present.
    """
    id_match = re.search(r'\b\d{5}\b', answer)
    if id_match is None:
        return None
    intent_match = re.search(r'user(?:intent)?\s*:\s*(.*)', answer, re.IGNORECASE)
    intent = intent_match.group(1).strip() if intent_match else ''
    intent = "".join(ch for ch in intent if ch not in punctuations)
    return id_match.group(0), intent


def slow_echo(history):
    """Generate the bot reply for the last chat turn, streaming it to the UI.

    The function is a generator that yields the updated ``history`` after
    every piece of text so the chatbot can render it progressively. It works
    in two phases driven by module-level state:

    1. Intent detection (while ``is_locked`` is False): the model is asked,
       using ``SYS_TEMPLATE`` and the nodes retrieved for the first message,
       to either ask a clarifying question or return an ``ID`` / ``userIntent``
       block. The reply is buffered up to 1500 characters so that such a block
       is not shown to the user; a reply without "id" is typed out as is.
    2. Answering (once an ID was found, or when already locked): articles of
       the law are retrieved with ``query_df_filtered`` and the model answers
       from them using ``sys_prompt_intent``. On the first answer the law's
       topic and a shortened link are appended and the session is locked.

    Changes compared with the previous version:
    - a reply containing "id" but no five-digit number, no intent line, or an
      ID unknown to ``df`` no longer raises; it is shown as a normal reply;
    - an empty extracted intent falls back to the user's own message;
    - a failing URL shortener falls back to the full link;
    - buffered text is replayed at most once;
    - the debug prints of the full prompts were removed.

    Args:
        history: chatbot history; ``history[-1]`` is ``[user_message, None]``.

    Yields:
        ``history`` with ``history[-1][1]`` growing as text arrives.
    """
    global prompt, is_locked, is_found, captured_ID, user_intent_text
    global x, info, full_ans

    user_message = history[-1][0]
    my_query = user_message

    # The candidate nodes are retrieved once, for the first message only.
    if x == 0:
        info = query_df(user_message)
        x += 1

    if not is_locked:
        # ---- Phase 1: determine the user's intent --------------------------
        message_text = [
            {"role": "system", "content": SYS_TEMPLATE.format(info)},
            {"role": "user", "content": prompt.rstrip() + f"\nHuman : {user_message}"},
        ]
        stream = client.chat.completions.create(
            model=mod,
            messages=message_text,
            temperature=0.0,
            max_tokens=1700,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=True,
        )
        history[-1][1] = ""
        full_ans = ""
        cleaned = False  # True once the 1500-character buffer was flushed
        is_found = False
        for chunk in stream:
            if not chunk.choices:
                continue
            piece = chunk.choices[0].delta.content
            if piece is None:
                continue
            full_ans += piece
            if is_found:
                # Keep collecting the hidden intent block without showing it.
                continue
            if cleaned:
                # Buffer already flushed: stream the rest straight through.
                time.sleep(0.03)
                history[-1][1] += piece
                yield history
            elif len(full_ans) >= 1500:
                cleaned = True
                full_ans = clean_ans(full_ans)
                if 'id' in full_ans.lower():
                    is_found = True
                else:
                    yield from _type_out(history, full_ans, 0.03)

        # Short replies never reached the flush threshold; decide now.
        # Checking `cleaned` (not the length) avoids replaying flushed text.
        if not is_found and not cleaned:
            if 'id' in full_ans.lower():
                is_found = True
            else:
                full_ans = clean_ans(full_ans)
                yield from _type_out(history, full_ans, 0.02)

        if is_found:
            parsed = _parse_intent_reply(full_ans)
            if parsed is not None and (df['Id'] == int(parsed[0])).any():
                captured_ID, user_intent_text = parsed
                my_query = user_intent_text or user_message
            else:
                # "id" appeared but no usable law ID: treat the reply as an
                # ordinary message (nothing has been displayed yet).
                is_found = False
                full_ans = clean_ans(full_ans)
                yield from _type_out(history, full_ans, 0.02)
    else:
        full_ans = captured_ID

    if is_found or is_locked:
        # ---- Phase 2: answer from the articles of the selected law ---------
        related_txt = query_df_filtered(my_query, captured_ID)
        law_df = df[df['Id'] == int(captured_ID)].reset_index()
        message_text = [
            {"role": "system", "content": sys_prompt_intent.format(related_txt)},
            {"role": "user", "content": prompt.rstrip() + f"\nHuman : {my_query}"},
        ]
        stream = client.chat.completions.create(
            model=mod,
            messages=message_text,
            temperature=0.0,
            max_tokens=1500,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=True,
        )
        history[-1][1] = ""
        full_ans = ''
        for chunk in stream:
            if not chunk.choices:
                continue
            piece = chunk.choices[0].delta.content
            if piece is None:
                continue
            time.sleep(0.03)
            piece = clean_ans(piece)
            history[-1][1] += piece
            full_ans += piece
            yield history

        if not is_locked:
            # First answer for this law: append its topic and link.
            full_link = law_df['Links'][0]
            try:
                link = shortener.tinyurl.short(full_link)
            except Exception:
                # The shortener is a network service; never lose the answer
                # because of it.
                link = full_link
            law_links = f"\n\nTopic : {law_df['Topic'][0]}\nLink  : {link}"
            yield from _type_out(history, law_links, 0.01)
        is_locked = True

    prompt = update_prompt(my_query, full_ans)
def test_function():
    """Reset all module-level conversation state for a new search.

    Creates a fresh conversation memory and restores every state variable to
    the value it has when the module is first loaded. ``prompt`` is reset to
    exactly the module's initial text; the previous version reset it to an
    indented variant, so the first prompt after a reset differed from the
    first prompt of a fresh start.
    """
    global new_session
    global is_locked
    global is_found
    global user_intent_text
    global captured_ID
    global full_ans
    global history
    global info
    global prompt
    global x
    global memory
    memory = ConversationBufferWindowMemory()
    new_session = False
    is_locked = False
    is_found = False
    user_intent_text = ''
    captured_ID = ''
    full_ans = ''
    history = ''
    info = ''
    x = 0  # makes slow_echo run a new retrieval for the next message
    prompt = "\nCurrent conversation:\n"
def reset_echo(history):
    """Yield a chat history reduced to its first entry.

    Written as a generator (a single ``yield``). Raises ``IndexError`` when
    ``history`` is empty.
    """
    first_entry = history[0]
    yield [first_entry]
# Greeting shown as the first bot message, and the page description.
welcome_message=" مرحبا معك عمار متخصص في موسوعة القوانين لوزارة العدل بالامارات.كيف يمكنني مساعدتك ؟ "
desc = "البوابة القانونية لوزارة العدل - الامارات العربية المتحدة- القوانين والتشريعات"
with gr.Blocks(theme=gr.themes.Soft(), title="HI") as demo:
    # Header row: logo and description.
    with gr.Row():
        image_path = "https://i.postimg.cc/kgJGhg32/UAE-MOJ-img.png"
        gr.Image(image_path, height=120, show_download_button=False, show_label= False)
        gr.Markdown(value=desc, rtl=True)
    chatbot = gr.Chatbot(value=[(None,welcome_message)],height=350, rtl=True)
    # Input row. The former bare `submit_btn.click()` call was removed: it
    # attached no handler, the real one is registered further down.
    with gr.Row():
        msg = gr.Textbox(container=False, min_width=750)
        submit_btn = gr.Button(value="Submit", variant="primary")
    with gr.Row():
        new_search = gr.Button(value="بحث جديد")
        # First listener of "new search": reset the conversation state.
        new_search.click(fn=test_function)
    # Enter key and Submit button: queue the message, then stream the reply.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        slow_echo, chatbot, chatbot
    )
    submit_btn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        slow_echo, chatbot, chatbot
    )
    # Second listener of "new search": clear the textbox, then trim the chat
    # back to its first entry.
    new_search.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        reset_echo, chatbot, chatbot
    )
demo.launch(inline=False)