from llama_index.core.vector_stores.types import ExactMatchFilter, MetadataFilters
from langchain_openai import AzureChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.memory import ConversationBufferWindowMemory
from sentence_transformers import util
from openai import AzureOpenAI
from bs4 import BeautifulSoup
import pyshorteners
import gradio as gr
import pandas as pd
import numpy as np
import warnings
import pickle
import string
import time
import ast
import os
import re
# Azure OpenAI connection settings (shared by the chat client and LangChain below).
azure_endpoint = "https://moj-ada3.openai.azure.com/"
api_key = "9639718f1a7d478a9313d2b2aeb5dacc"
api_version = "2024-02-15-preview"

client = AzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_key=api_key,
    api_version=api_version,
)
df = pd.read_csv("data/Data.csv")
warnings.filterwarnings("ignore")
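# Assumed Data.csv schema (inferred from the column accesses further down):
# 'Id', 'HTML' (the law's page markup), 'Subjects' (a stringified list of
# article headers), 'Links' and 'Topic'.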
def extract_title(text):
    # Return the part after the last dash (the article title), or "" if no dash.
    if '-' in text:
        return text.split('-')[-1].strip()
    elif '–' in text:
        return text.split('–')[-1].strip()
    else:
        return ""

def remove_title(text):
    # Return the part before the first dash (the article number), or the text unchanged.
    if '-' in text:
        return text.split('-')[0].strip()
    elif '–' in text:
        return text.split('–')[0].strip()
    else:
        return text
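# Illustrative behaviour of the two helpers on a hypothetical header:
#   extract_title("المادة 5 - الأهلية")  ->  "الأهلية"   (part after the dash)
#   remove_title("المادة 5 - الأهلية")   ->  "المادة 5"   (part before the dash)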
def get_articles(i):
    # Parse row i of df: split the law's HTML into one row per article.
    try:
        result_df = pd.DataFrame(columns=['Header', 'Text', 'Comment'])
        html = df['HTML'][i]
        soup = BeautifulSoup(html, 'html.parser')
        divs = soup.find_all('div')
        # CSS class marking an article-header div in the source markup.
        h_class = 'x__1575___1604___1605___1575___1583___1577_14'
        x = 0
        txt = ''
        headers = ast.literal_eval(df['Subjects'][i])
        for d in divs:
            try:
                if d.get('class') is None:
                    d_class = d.find('div').get('class')[0]
                    d_text = d.find('div').text.replace('\n\n', ' ').replace('\n', ' ')
                else:
                    d_class = d.get('class')[0]
                    d_text = d.text.replace('\n\n', ' ').replace('\n', ' ')
                if h_class not in d_class:
                    # Still inside the current article: accumulate its text.
                    txt += " " + d_text
                else:
                    # A new header: flush the accumulated text as the previous row.
                    if x == 0:
                        result_df = pd.concat([result_df, pd.DataFrame({'Header': ['Desc'], 'Text': [txt]})], ignore_index=True)
                    else:
                        result_df = pd.concat([result_df, pd.DataFrame({'Header': [headers[x - 1]], 'Text': [txt]})], ignore_index=True)
                    txt = ''
                    x += 1
            except:
                pass
        # Flush the last article.
        result_df = pd.concat([result_df, pd.DataFrame({'Header': [headers[x - 1]], 'Text': [txt]})], ignore_index=True)
        # Footnote divs (id="fn…") hold the pre-amendment wording of amended articles.
        divs_with_showfn = soup.find_all('div', id=lambda x: x and x.startswith('fn'))
        for r in range(result_df.shape[0]):
            article = result_df['Header'][r].split('-')[0].strip()
            for n, d in enumerate(divs_with_showfn):
                edit = d.text.replace('\n\n', ' ').replace('\n', ' ')
                match = edit[:35]
                if (article.replace("الأولى", "الاولى") in match.replace("الأولى", "الاولى")) and ("القديم" in match):
                    result_df['Comment'][r] = edit
        # The footnotes get appended to the last article's text; cut them off.
        if divs_with_showfn:
            firstindex = divs_with_showfn[0].text.replace('\n\n', ' ').replace('\n', ' ')
            last_e = result_df.shape[0] - 1
            mada = result_df['Text'][last_e]
            if firstindex in mada:
                result_df['Text'][last_e] = mada.split(firstindex)[0]
        # result_df['Title'] = result_df['Header'].apply(extract_title)
        # result_df['Header'] = result_df['Header'].apply(remove_title)
        return result_df.reset_index(drop=True)
    except:
        # Parsing failed; callers receive None.
        pass
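# get_articles(i) yields one row per article: 'Header' (article heading),
# 'Text' (article body) and 'Comment' (the pre-amendment wording, when found).
# If parsing raises, the bare except above makes the function return None.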
with open('data/ada_base_index_small.pkl', 'rb') as f:
    base_index_ = pickle.load(f)

deployment = "gpt-35-turbo-16k"
os.environ["AZURE_OPENAI_API_KEY"] = api_key
os.environ["AZURE_OPENAI_ENDPOINT"] = azure_endpoint
llm_chain = AzureChatOpenAI(
    openai_api_version=api_version,
    azure_deployment=deployment,
)
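# Assumption: the pickled file holds a pre-built llama_index vector index whose
# nodes carry 'ID', 'Article' and 'Description' metadata keys; the retrieval
# helpers below depend on exactly those keys.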
| SYS_TEMPLATE = """ | |
| The following is a friendly conversation between a human and an AI. | |
| AI must follow the Instructions below | |
| Instructions: | |
| - AI is an Arabic legal expert in the UAE. | |
| - AI shall always reply in Arabic. | |
| - AI shall never reply in English. | |
| - AI shall not repeat any questions or rephrase them. | |
| - AI shall ask a presise question if needed to determine the user's intent. | |
| - AI shall only ask a maximum of one question if needed to human and then determine his intent. | |
| - AI shall only reply to questions related to law subjects. | |
| - AI shall not answer or explain or give any advice to user questions. | |
| - AI MUST not provide any details ever from given information, only use it to determine the desired intent. | |
| - AI shall use the given information only to ask precise and short question to determine user intent. | |
| - AI shall determine the user desired intent with the minimum number of questions possible. | |
| - AI shall not ask the user again after the user confirms on any question. | |
| - AI shall decide user intent if the user's query contains enough details without asiking him any more questions. | |
| - AI shall decide which suits query better if user wants a general info or says give me anything. | |
| - AI's only purpose is to determine the intended topic from the user. | |
| - AI shall choose node with the best description matching with the human's intent. | |
| - AI shall always end the conversation with the returns below as long as the user question matches with given info. | |
| - if AI asks a question and human says he dosent know the spesific law or article then AI shall determine and end the conversation with the returns below. | |
| - if Human asks a question (Is it permissible (هل يجوز)) AI should find the best node that can answer the question with yes or no. | |
| - AI shall end the conversation when the user confirms his intent and return as mentioned below from node's metadata. | |
| - AI shall mention every detail the user wants in the userintent returns. | |
| - AI MUST include the five digits number in the returns. | |
| - AI shall never leave the ID in returns empty it should always be five digits. | |
| Returns: | |
| [ | |
| ID: five didgits number , | |
| Topic: , | |
| userIntent : | |
| ] | |
| Information: | |
| {} | |
| """ | |
sys_prompt_intent = """
The following is a friendly conversation between a human and an AI.
The AI must follow the Instructions below.
Instructions:
- AI is an Arabic legal expert in the UAE.
- AI shall always reply in Arabic.
- AI shall never reply in English.
- AI shall answer the human's questions based on the Content provided.
- AI shall answer only from within the Content provided, and NOT from outside it.
- AI shall answer using the exact text in the Content and not improvise.
- AI shall NOT improvise, give any advice, or add explanations.
- AI shall not give the user a link and tell them to search in it; it should always provide the required info directly.
- AI shall always answer the user's query in a professional and informative way, including all the details.
- AI shall answer every question the human asks in the conversation in a detailed way.
- AI shall include the article number (رقم المادة) in the answer.
Content:
{}
"""
punctuations = string.punctuation

def generate_embeddings(text, model="ada3_small"):
    # Embed `text` with the Azure embedding deployment (default: "ada3_small").
    return client.embeddings.create(input=[text], model=model).data[0].embedding
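# Sketch of a call (assuming "ada3_small" names an Azure embedding deployment):
#   vec = generate_embeddings("ما هي عقوبة السرقة؟")   # -> list[float]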
base_retriever = base_index_.as_retriever(similarity_top_k=10)

def query_df(query):
    # Retrieve the 10 most similar index nodes and build the "Information"
    # block consumed by SYS_TEMPLATE during intent detection.
    retrievals = base_retriever.retrieve(query)
    info = ''
    for i, r in enumerate(retrievals):
        # Map the node back to its source row, then to the exact article.
        article_index = df[df['Id'] == int(r.metadata['ID'])].index[0]
        article_df = get_articles(article_index)
        article_intended = article_df[article_df['Header'] == r.metadata['Article']].reset_index()
        article_text = article_intended['Text'][0]
        if len(article_text) > 800:
            # Long article: keep only the 800-character chunk closest to the query.
            related_txt = related_text(article_text, query, 800)[0]
        else:
            related_txt = article_text
        meta = {
            'Description': r.metadata['Description'],
            'ID': r.metadata['ID'],
        }
        info += f"Node Number {i+1} : {related_txt} -- Node MetaData : {meta}\n"
    return info
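# The returned string has one line per retrieved node, e.g. (hypothetical values):
#   Node Number 1 : <article excerpt> -- Node MetaData : {'Description': ..., 'ID': '12345'}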
def query_df_filtered(query, id):
    # Retrieve only from the already-selected law (node metadata ID == id).
    filters = MetadataFilters(filters=[
        ExactMatchFilter(key="ID", value=str(id))
    ])
    b_retriever = base_index_.as_retriever(similarity_top_k=3, filters=filters)
    retrievals = b_retriever.retrieve(query)
    info_filtered = ''
    for i, r in enumerate(retrievals):
        article_index = df[df['Id'] == int(r.metadata['ID'])].index[0]
        article_df = get_articles(article_index)
        article_intended = article_df[article_df['Header'] == r.metadata['Article']].reset_index()
        article_text = article_intended['Text'][0]
        if len(article_text) > 5000:
            related_txt = related_text(article_text, query, 5000)[0]
        else:
            related_txt = article_text
        meta = {'Header': r.metadata['Article']}
        info_filtered += f"Article {meta} : {related_txt} \n"
    return info_filtered
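# Each line of the result looks like (hypothetical values):
#   Article {'Header': 'المادة 1 - التعريفات'} : <most relevant excerpt>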
def related_text(txt, q, size):
    # Split `txt` into overlapping chunks and return the chunk most similar to `q`.
    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=size,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = text_splitter.split_text(txt)
    embeddings = [generate_embeddings(chunk) for chunk in chunks]
    query_embedding = generate_embeddings(q)
    similarity_scores = util.cos_sim(query_embedding, embeddings)
    sorted_indices = np.argsort(-similarity_scores)
    # Keep only the single best-matching chunk.
    best_index = int(sorted_indices[0][0])
    return [chunks[best_index]]
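# Note: although every chunk is ranked, only the single best match is returned,
# as a one-element list, e.g.:
#   best_chunk = related_text(long_article_text, query, 800)[0]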
def format_messages(message_list):
    # Render LangChain messages as "Human : …" / "AI : …" transcript lines,
    # merging consecutive messages from the same speaker.
    formatted_messages = []
    current_speaker = None
    for message in message_list:
        if 'HumanMessage' in str(type(message)):
            speaker = 'Human'
        elif 'AIMessage' in str(type(message)):
            speaker = 'AI'
        else:
            continue
        if current_speaker != speaker:
            current_speaker = speaker
            formatted_messages.append(f'{current_speaker} : {message.content}')
        else:
            formatted_messages[-1] += f' {message.content}'
    return '\n'.join(formatted_messages)
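# Example transcript produced for the prompts (speakers merged per turn):
#   Human : <question>
#   AI : <clarifying question or answer>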
def memory_prompt():
    global history
    # Rebuild the running transcript; once more than 8 messages are stored,
    # the first 8 are dropped from the transcript.
    if len(memory.chat_memory.messages) <= 8:
        chat_history_lines = format_messages(memory.chat_memory.messages)
    else:
        chat_history_lines = format_messages(memory.chat_memory.messages[8:])
    prompt = f"""
Current conversation:
{chat_history_lines}
"""
    return prompt

def update_prompt(human, ai):
    # Persist one human/AI turn into memory, then rebuild the prompt from it.
    memory.save_context({"input": human}, {"output": ai})
    prompt = memory_prompt()
    return prompt
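# E.g. after update_prompt("<question>", "<answer>"), `prompt` reads:
#   Current conversation:
#   Human : <question>
#   AI : <answer>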
shortener = pyshorteners.Shortener()
short_url = shortener.tinyurl.short(df['Links'][0])  # warm-up call; result unused
mod = "gpt-35-turbo-16k"
memory = ConversationBufferWindowMemory()

# Per-session state (reset by test_function()).
x = 0                    # 0 until the first user turn has populated `info`
info = ''                # candidate nodes fed to the intent prompt
history = ''
is_locked = False        # True once a law has been chosen (Phase 2)
is_found = False         # True once a 5-digit ID appears in the model output
new_session = False      # currently unused flag
is_new = False           # currently unused flag
captured_ID = ''         # the chosen law's 5-digit ID
user_intent_text = ''
full_ans = ''
prompt = f"""
Current conversation:
"""
def clean_ans(answer):
    # Strip a leading speaker tag the model sometimes prepends.
    for prefix in ("Assistant:", "AI:", "AI :"):
        if answer.startswith(prefix):
            answer = answer[len(prefix):]
            break
    return answer
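# E.g. clean_ans("AI: مرحبا") -> " مرحبا"; answers without a leading tag pass
# through unchanged.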
def user(user_message, history):
    # Append the user's message to the chat and clear the textbox.
    return "", history + [[user_message, None]]
def slow_echo(history):
    global prompt
    global is_locked
    global is_found
    global captured_ID
    global user_intent_text
    global x
    global info
    global new_session
    global full_ans
    global is_new
    user_message = history[-1][0]
    my_query = history[-1][0]
    if x == 0:
        # First turn of the session: fetch candidate nodes for the intent prompt.
        info = query_df(user_message)
        x += 1
    if is_locked == False:
        # --- Phase 1: intent detection --------------------------------------
        SYS_PROMPT = SYS_TEMPLATE.format(info)
        USER_PROMPT = prompt.rstrip() + f"\nHuman : {user_message}"
        message_text = [
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": USER_PROMPT},
        ]
        stream = client.chat.completions.create(
            model=mod,
            messages=message_text,
            temperature=0.0,
            max_tokens=1700,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=True,
        )
        history[-1][1] = ""
        full_ans = ""
        cleaned = False
        is_found = False
        # Buffer the stream: the [ID/Topic/userIntent] block must never be shown
        # to the user; a plain reply is replayed character by character instead.
        for chunk in stream:
            if not chunk.choices:
                continue
            if chunk.choices[0].delta.content is None:
                continue
            if is_found == False:
                if cleaned == False:
                    full_ans += chunk.choices[0].delta.content
                    if len(full_ans) >= 1500:
                        # Long answer: treat it as a plain reply and start echoing.
                        cleaned = True
                        full_ans = clean_ans(full_ans)
                        if 'id' in full_ans.lower():
                            is_found = True
                        else:
                            for t in full_ans:
                                time.sleep(0.03)
                                history[-1][1] += t
                                yield history
                elif cleaned == True:
                    time.sleep(0.03)
                    full_ans += chunk.choices[0].delta.content
                    history[-1][1] += chunk.choices[0].delta.content
                    yield history
            else:
                full_ans += chunk.choices[0].delta.content
        if is_found == False:
            if len(full_ans) < 1500:
                if 'id' in full_ans.lower():
                    # The returns block arrived in a short answer.
                    is_found = True
                else:
                    full_ans = clean_ans(full_ans)
                    for t in full_ans:
                        time.sleep(0.02)
                        history[-1][1] += t
                        yield history
    else:
        # Session already locked: reuse the captured law ID.
        full_ans = captured_ID
    # --- Phase 2: answer from the selected law ------------------------------
    if (is_found) or (is_locked):
        if not is_locked:
            # First time here: pull the 5-digit ID and the userIntent text out
            # of the returns block produced in Phase 1.
            pattern = r'\b\d{5}\b'
            matches = re.findall(pattern, full_ans)
            captured_ID = matches[0]
            matched = re.search(r'user(?:intent)?\s*:\s*(.*)', full_ans, re.IGNORECASE)
            user_intent_text = matched.group(1).strip()
            user_intent_text = "".join([c for c in user_intent_text if c not in punctuations])
            my_query = user_intent_text
        else:
            my_query = user_message
        related_txt = query_df_filtered(my_query, captured_ID)
        law_df = df[df['Id'] == int(captured_ID)].reset_index()
        SYS_PROMPT = sys_prompt_intent.format(related_txt)
        USER_PROMPT = prompt.rstrip() + f"\nHuman : {my_query}"
        message_text = [
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": USER_PROMPT},
        ]
        stream = client.chat.completions.create(
            model=mod,
            messages=message_text,
            temperature=0.0,
            max_tokens=1500,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=True,
        )
        history[-1][1] = ""
        full_ans = ''
        for chunk in stream:
            if not chunk.choices:
                continue
            if chunk.choices[0].delta.content is not None:
                time.sleep(0.03)
                history[-1][1] += clean_ans(chunk.choices[0].delta.content)
                full_ans += clean_ans(chunk.choices[0].delta.content)
                yield history
        if not is_locked:
            # First answer for this law: append its topic and a shortened link,
            # then lock the session onto this law.
            link = shortener.tinyurl.short(law_df['Links'][0])
            law_links = f"\n\nTopic : {law_df['Topic'][0]}\nLink : {link}"
            for chunk in law_links:
                time.sleep(0.01)
                history[-1][1] += chunk
                yield history
            is_locked = True
    prompt = update_prompt(my_query, full_ans)
def test_function():
    # Reset all per-session state for a new search.
    global new_session
    global is_locked
    global is_found
    global user_intent_text
    global captured_ID
    global full_ans
    global history
    global info
    global prompt
    global x
    global memory
    memory = ConversationBufferWindowMemory()
    new_session = False
    is_locked = False
    is_found = False
    user_intent_text = ''
    captured_ID = ''
    full_ans = ''
    history = ''
    info = ''
    x = 0
    prompt = f"""
Current conversation:
"""
def reset_echo(history):
    # Keep only the welcome message in the chat window.
    history = [history[0]]
    yield history

# "Hello, this is Ammar, a specialist in the UAE Ministry of Justice law encyclopedia. How can I help you?"
welcome_message = " مرحبا معك عمار متخصص في موسوعة القوانين لوزارة العدل بالامارات.كيف يمكنني مساعدتك ؟ "
# "The legal portal of the Ministry of Justice - United Arab Emirates - laws and legislation"
desc = "البوابة القانونية لوزارة العدل - الامارات العربية المتحدة- القوانين والتشريعات"
with gr.Blocks(theme=gr.themes.Soft(), title="HI") as demo:
    with gr.Row():
        image_path = "https://i.postimg.cc/kgJGhg32/UAE-MOJ-img.png"
        gr.Image(image_path, height=120, show_download_button=False, show_label=False)
    gr.Markdown(value=desc, rtl=True)
    chatbot = gr.Chatbot(value=[(None, welcome_message)], height=350, rtl=True)
    with gr.Row():
        msg = gr.Textbox(container=False, min_width=750)
        submit_btn = gr.Button(value="Submit", variant="primary")
    with gr.Row():
        new_search = gr.Button(value="بحث جديد")  # "New search"
    # "New search" first resets the session state, then (below) clears the chat.
    new_search.click(fn=test_function)
    # gr.ClearButton([msg, chatbot])
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        slow_echo, chatbot, chatbot
    )
    submit_btn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        slow_echo, chatbot, chatbot
    )
    new_search.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        reset_echo, chatbot, chatbot
    )
demo.launch(inline=False)