from llama_index.core.vector_stores.types import ExactMatchFilter, MetadataFilters
from langchain_openai import AzureChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.memory import ConversationBufferWindowMemory
from sentence_transformers import util
from openai import AzureOpenAI
from bs4 import BeautifulSoup
import pyshorteners
import gradio as gr
import pandas as pd
import numpy as np
import warnings
import pickle
import string
import time
import ast
import os
import re
# Azure OpenAI connection settings (shared by the chat client and LangChain below).
azure_endpoint = "https://moj-ada3.openai.azure.com/"
api_key = "9639718f1a7d478a9313d2b2aeb5dacc"
api_version = "2024-02-15-preview"

client = AzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_key=api_key,
    api_version=api_version,
)
df = pd.read_csv("data/Data.csv")
warnings.filterwarnings("ignore")
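# Assumed Data.csv schema (inferred from the column accesses further down):
# 'Id', 'HTML' (the law's page markup), 'Subjects' (a stringified list of
# article headers), 'Links' and 'Topic'.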
def extract_title(text):
    # Return the part after the last dash (the article title), or "" if no dash.
    if '-' in text:
        return text.split('-')[-1].strip()
    elif '–' in text:
        return text.split('–')[-1].strip()
    else:
        return ""

def remove_title(text):
    # Return the part before the first dash (the article number), or the text unchanged.
    if '-' in text:
        return text.split('-')[0].strip()
    elif '–' in text:
        return text.split('–')[0].strip()
    else:
        return text
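# Illustrative behaviour of the two helpers on a hypothetical header:
#   extract_title("المادة 5 - الأهلية")  ->  "الأهلية"   (part after the dash)
#   remove_title("المادة 5 - الأهلية")   ->  "المادة 5"   (part before the dash)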
def get_articles(i):
    # Parse row i of df: split the law's HTML into one row per article.
    try:
        result_df = pd.DataFrame(columns=['Header', 'Text', 'Comment'])
        html = df['HTML'][i]
        soup = BeautifulSoup(html, 'html.parser')
        divs = soup.find_all('div')
        # CSS class marking an article-header div in the source markup.
        h_class = 'x__1575___1604___1605___1575___1583___1577_14'
        x = 0
        txt = ''
        headers = ast.literal_eval(df['Subjects'][i])
        for d in divs:
            try:
                if d.get('class') is None:
                    d_class = d.find('div').get('class')[0]
                    d_text = d.find('div').text.replace('\n\n', ' ').replace('\n', ' ')
                else:
                    d_class = d.get('class')[0]
                    d_text = d.text.replace('\n\n', ' ').replace('\n', ' ')
                if h_class not in d_class:
                    # Still inside the current article: accumulate its text.
                    txt += " " + d_text
                else:
                    # A new header: flush the accumulated text as the previous row.
                    if x == 0:
                        result_df = pd.concat([result_df, pd.DataFrame({'Header': ['Desc'], 'Text': [txt]})], ignore_index=True)
                    else:
                        result_df = pd.concat([result_df, pd.DataFrame({'Header': [headers[x - 1]], 'Text': [txt]})], ignore_index=True)
                    txt = ''
                    x += 1
            except:
                pass
        # Flush the last article.
        result_df = pd.concat([result_df, pd.DataFrame({'Header': [headers[x - 1]], 'Text': [txt]})], ignore_index=True)
        # Footnote divs (id="fn…") hold the pre-amendment wording of amended articles.
        divs_with_showfn = soup.find_all('div', id=lambda x: x and x.startswith('fn'))
        for r in range(result_df.shape[0]):
            article = result_df['Header'][r].split('-')[0].strip()
            for n, d in enumerate(divs_with_showfn):
                edit = d.text.replace('\n\n', ' ').replace('\n', ' ')
                match = edit[:35]
                if (article.replace("الأولى", "الاولى") in match.replace("الأولى", "الاولى")) and ("القديم" in match):
                    result_df['Comment'][r] = edit
        # The footnotes get appended to the last article's text; cut them off.
        if divs_with_showfn:
            firstindex = divs_with_showfn[0].text.replace('\n\n', ' ').replace('\n', ' ')
            last_e = result_df.shape[0] - 1
            mada = result_df['Text'][last_e]
            if firstindex in mada:
                result_df['Text'][last_e] = mada.split(firstindex)[0]
        # result_df['Title'] = result_df['Header'].apply(extract_title)
        # result_df['Header'] = result_df['Header'].apply(remove_title)
        return result_df.reset_index(drop=True)
    except:
        # Parsing failed; callers receive None.
        pass
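# get_articles(i) yields one row per article: 'Header' (article heading),
# 'Text' (article body) and 'Comment' (the pre-amendment wording, when found).
# If parsing raises, the bare except above makes the function return None.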
with open('data/ada_base_index_small.pkl', 'rb') as f:
    base_index_ = pickle.load(f)

deployment = "gpt-35-turbo-16k"
os.environ["AZURE_OPENAI_API_KEY"] = api_key
os.environ["AZURE_OPENAI_ENDPOINT"] = azure_endpoint
llm_chain = AzureChatOpenAI(
    openai_api_version=api_version,
    azure_deployment=deployment,
)
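# Assumption: the pickled file holds a pre-built llama_index vector index whose
# nodes carry 'ID', 'Article' and 'Description' metadata keys; the retrieval
# helpers below depend on exactly those keys.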
| SYS_TEMPLATE = """ | |
| The following is a friendly conversation between a human and an AI. | |
| AI must follow the Instructions below | |
| Instructions: | |
| - AI is an Arabic legal expert in the UAE. | |
| - AI shall always reply in Arabic. | |
| - AI shall never reply in English. | |
| - AI shall not repeat any questions or rephrase them. | |
| - AI shall ask a presise question if needed to determine the user's intent. | |
| - AI shall only ask a maximum of one question if needed to human and then determine his intent. | |
| - AI shall only reply to questions related to law subjects. | |
| - AI shall not answer or explain or give any advice to user questions. | |
| - AI MUST not provide any details ever from given information, only use it to determine the desired intent. | |
| - AI shall use the given information only to ask precise and short question to determine user intent. | |
| - AI shall determine the user desired intent with the minimum number of questions possible. | |
| - AI shall not ask the user again after the user confirms on any question. | |
| - AI shall decide user intent if the user's query contains enough details without asiking him any more questions. | |
| - AI shall decide which suits query better if user wants a general info or says give me anything. | |
| - AI's only purpose is to determine the intended topic from the user. | |
| - AI shall choose node with the best description matching with the human's intent. | |
| - AI shall always end the conversation with the returns below as long as the user question matches with given info. | |
| - if AI asks a question and human says he dosent know the spesific law or article then AI shall determine and end the conversation with the returns below. | |
| - if Human asks a question (Is it permissible (هل يجوز)) AI should find the best node that can answer the question with yes or no. | |
| - AI shall end the conversation when the user confirms his intent and return as mentioned below from node's metadata. | |
| - AI shall mention every detail the user wants in the userintent returns. | |
| - AI MUST include the five digits number in the returns. | |
| - AI shall never leave the ID in returns empty it should always be five digits. | |
| Returns: | |
| [ | |
| ID: five didgits number , | |
| Topic: , | |
| userIntent : | |
| ] | |
| Information: | |
| {} | |
| """ | |
sys_prompt_intent = """
The following is a friendly conversation between a human and an AI.
The AI must follow the Instructions below.
Instructions:
- AI is an Arabic legal expert in the UAE.
- AI shall always reply in Arabic.
- AI shall never reply in English.
- AI shall answer the human's questions based on the Content provided.
- AI shall answer only from within the Content provided, and NOT from outside it.
- AI shall answer using the exact text in the Content and not improvise.
- AI shall NOT improvise, give any advice, or add explanations.
- AI shall not give the user a link and tell them to search in it; it should always provide the required info directly.
- AI shall always answer the user's query in a professional and informative way, including all the details.
- AI shall answer every question the human asks in the conversation in a detailed way.
- AI shall include the article number (رقم المادة) in the answer.
Content:
{}
"""
punctuations = string.punctuation

def generate_embeddings(text, model="ada3_small"):
    # Embed `text` with the Azure embedding deployment (default: "ada3_small").
    return client.embeddings.create(input=[text], model=model).data[0].embedding
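# Sketch of a call (assuming "ada3_small" names an Azure embedding deployment):
#   vec = generate_embeddings("ما هي عقوبة السرقة؟")   # -> list[float]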
base_retriever = base_index_.as_retriever(similarity_top_k=10)

def query_df(query):
    # Retrieve the 10 most similar index nodes and build the "Information"
    # block consumed by SYS_TEMPLATE during intent detection.
    retrievals = base_retriever.retrieve(query)
    info = ''
    for i, r in enumerate(retrievals):
        # Map the node back to its source row, then to the exact article.
        article_index = df[df['Id'] == int(r.metadata['ID'])].index[0]
        article_df = get_articles(article_index)
        article_intended = article_df[article_df['Header'] == r.metadata['Article']].reset_index()
        article_text = article_intended['Text'][0]
        if len(article_text) > 800:
            # Long article: keep only the 800-character chunk closest to the query.
            related_txt = related_text(article_text, query, 800)[0]
        else:
            related_txt = article_text
        meta = {
            'Description': r.metadata['Description'],
            'ID': r.metadata['ID'],
        }
        info += f"Node Number {i+1} : {related_txt} -- Node MetaData : {meta}\n"
    return info
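# The returned string has one line per retrieved node, e.g. (hypothetical values):
#   Node Number 1 : <article excerpt> -- Node MetaData : {'Description': ..., 'ID': '12345'}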
def query_df_filtered(query, id):
    # Retrieve only from the already-selected law (node metadata ID == id).
    filters = MetadataFilters(filters=[
        ExactMatchFilter(key="ID", value=str(id))
    ])
    b_retriever = base_index_.as_retriever(similarity_top_k=3, filters=filters)
    retrievals = b_retriever.retrieve(query)
    info_filtered = ''
    for i, r in enumerate(retrievals):
        article_index = df[df['Id'] == int(r.metadata['ID'])].index[0]
        article_df = get_articles(article_index)
        article_intended = article_df[article_df['Header'] == r.metadata['Article']].reset_index()
        article_text = article_intended['Text'][0]
        if len(article_text) > 5000:
            related_txt = related_text(article_text, query, 5000)[0]
        else:
            related_txt = article_text
        meta = {'Header': r.metadata['Article']}
        info_filtered += f"Article {meta} : {related_txt} \n"
    return info_filtered
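# Each line of the result looks like (hypothetical values):
#   Article {'Header': 'المادة 1 - التعريفات'} : <most relevant excerpt>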
def related_text(txt, q, size):
    # Split `txt` into overlapping chunks and return the chunk most similar to `q`.
    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=size,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = text_splitter.split_text(txt)
    embeddings = [generate_embeddings(chunk) for chunk in chunks]
    query_embedding = generate_embeddings(q)
    similarity_scores = util.cos_sim(query_embedding, embeddings)
    sorted_indices = np.argsort(-similarity_scores)
    # Keep only the single best-matching chunk.
    best_index = int(sorted_indices[0][0])
    return [chunks[best_index]]
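# Note: although every chunk is ranked, only the single best match is returned,
# as a one-element list, e.g.:
#   best_chunk = related_text(long_article_text, query, 800)[0]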
def format_messages(message_list):
    # Render LangChain messages as "Human : …" / "AI : …" transcript lines,
    # merging consecutive messages from the same speaker.
    formatted_messages = []
    current_speaker = None
    for message in message_list:
        if 'HumanMessage' in str(type(message)):
            speaker = 'Human'
        elif 'AIMessage' in str(type(message)):
            speaker = 'AI'
        else:
            continue
        if current_speaker != speaker:
            current_speaker = speaker
            formatted_messages.append(f'{current_speaker} : {message.content}')
        else:
            formatted_messages[-1] += f' {message.content}'
    return '\n'.join(formatted_messages)
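# Example transcript produced for the prompts (speakers merged per turn):
#   Human : <question>
#   AI : <clarifying question or answer>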
def memory_prompt():
    global history
    # Rebuild the running transcript; once more than 8 messages are stored,
    # the first 8 are dropped from the transcript.
    if len(memory.chat_memory.messages) <= 8:
        chat_history_lines = format_messages(memory.chat_memory.messages)
    else:
        chat_history_lines = format_messages(memory.chat_memory.messages[8:])
    prompt = f"""
Current conversation:
{chat_history_lines}
"""
    return prompt

def update_prompt(human, ai):
    # Persist one human/AI turn into memory, then rebuild the prompt from it.
    memory.save_context({"input": human}, {"output": ai})
    prompt = memory_prompt()
    return prompt
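# E.g. after update_prompt("<question>", "<answer>"), `prompt` reads:
#   Current conversation:
#   Human : <question>
#   AI : <answer>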
shortener = pyshorteners.Shortener()
short_url = shortener.tinyurl.short(df['Links'][0])  # warm-up call; result unused
mod = "gpt-35-turbo-16k"
memory = ConversationBufferWindowMemory()

# Per-session state (reset by test_function()).
x = 0                    # 0 until the first user turn has populated `info`
info = ''                # candidate nodes fed to the intent prompt
history = ''
is_locked = False        # True once a law has been chosen (Phase 2)
is_found = False         # True once a 5-digit ID appears in the model output
new_session = False      # currently unused flag
is_new = False           # currently unused flag
captured_ID = ''         # the chosen law's 5-digit ID
user_intent_text = ''
full_ans = ''
prompt = f"""
Current conversation:
"""
def clean_ans(answer):
    # Strip a leading speaker tag the model sometimes prepends.
    for prefix in ("Assistant:", "AI:", "AI :"):
        if answer.startswith(prefix):
            answer = answer[len(prefix):]
            break
    return answer
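# E.g. clean_ans("AI: مرحبا") -> " مرحبا"; answers without a leading tag pass
# through unchanged.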
def user(user_message, history):
    # Append the user's message to the chat and clear the textbox.
    return "", history + [[user_message, None]]
def slow_echo(history):
    global prompt
    global is_locked
    global is_found
    global captured_ID
    global user_intent_text
    global x
    global info
    global new_session
    global full_ans
    global is_new
    user_message = history[-1][0]
    my_query = history[-1][0]
    if x == 0:
        # First turn of the session: fetch candidate nodes for the intent prompt.
        info = query_df(user_message)
        x += 1
    if is_locked == False:
        # --- Phase 1: intent detection --------------------------------------
        SYS_PROMPT = SYS_TEMPLATE.format(info)
        USER_PROMPT = prompt.rstrip() + f"\nHuman : {user_message}"
        message_text = [
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": USER_PROMPT},
        ]
        stream = client.chat.completions.create(
            model=mod,
            messages=message_text,
            temperature=0.0,
            max_tokens=1700,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=True,
        )
        history[-1][1] = ""
        full_ans = ""
        cleaned = False
        is_found = False
        # Buffer the stream: the [ID/Topic/userIntent] block must never be shown
        # to the user; a plain reply is replayed character by character instead.
        for chunk in stream:
            if not chunk.choices:
                continue
            if chunk.choices[0].delta.content is None:
                continue
            if is_found == False:
                if cleaned == False:
                    full_ans += chunk.choices[0].delta.content
                    if len(full_ans) >= 1500:
                        # Long answer: treat it as a plain reply and start echoing.
                        cleaned = True
                        full_ans = clean_ans(full_ans)
                        if 'id' in full_ans.lower():
                            is_found = True
                        else:
                            for t in full_ans:
                                time.sleep(0.03)
                                history[-1][1] += t
                                yield history
                elif cleaned == True:
                    time.sleep(0.03)
                    full_ans += chunk.choices[0].delta.content
                    history[-1][1] += chunk.choices[0].delta.content
                    yield history
            else:
                full_ans += chunk.choices[0].delta.content
        if is_found == False:
            if len(full_ans) < 1500:
                if 'id' in full_ans.lower():
                    # The returns block arrived in a short answer.
                    is_found = True
                else:
                    full_ans = clean_ans(full_ans)
                    for t in full_ans:
                        time.sleep(0.02)
                        history[-1][1] += t
                        yield history
    else:
        # Session already locked: reuse the captured law ID.
        full_ans = captured_ID
    # --- Phase 2: answer from the selected law ------------------------------
    if (is_found) or (is_locked):
        if not is_locked:
            # First time here: pull the 5-digit ID and the userIntent text out
            # of the returns block produced in Phase 1.
            pattern = r'\b\d{5}\b'
            matches = re.findall(pattern, full_ans)
            captured_ID = matches[0]
            matched = re.search(r'user(?:intent)?\s*:\s*(.*)', full_ans, re.IGNORECASE)
            user_intent_text = matched.group(1).strip()
            user_intent_text = "".join([c for c in user_intent_text if c not in punctuations])
            my_query = user_intent_text
        else:
            my_query = user_message
        related_txt = query_df_filtered(my_query, captured_ID)
        law_df = df[df['Id'] == int(captured_ID)].reset_index()
        SYS_PROMPT = sys_prompt_intent.format(related_txt)
        USER_PROMPT = prompt.rstrip() + f"\nHuman : {my_query}"
        message_text = [
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": USER_PROMPT},
        ]
        stream = client.chat.completions.create(
            model=mod,
            messages=message_text,
            temperature=0.0,
            max_tokens=1500,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=True,
        )
        history[-1][1] = ""
        full_ans = ''
        for chunk in stream:
            if not chunk.choices:
                continue
            if chunk.choices[0].delta.content is not None:
                time.sleep(0.03)
                history[-1][1] += clean_ans(chunk.choices[0].delta.content)
                full_ans += clean_ans(chunk.choices[0].delta.content)
                yield history
        if not is_locked:
            # First answer for this law: append its topic and a shortened link,
            # then lock the session onto this law.
            link = shortener.tinyurl.short(law_df['Links'][0])
            law_links = f"\n\nTopic : {law_df['Topic'][0]}\nLink : {link}"
            for chunk in law_links:
                time.sleep(0.01)
                history[-1][1] += chunk
                yield history
            is_locked = True
    prompt = update_prompt(my_query, full_ans)
def test_function():
    # Reset all per-session state for a new search.
    global new_session
    global is_locked
    global is_found
    global user_intent_text
    global captured_ID
    global full_ans
    global history
    global info
    global prompt
    global x
    global memory
    memory = ConversationBufferWindowMemory()
    new_session = False
    is_locked = False
    is_found = False
    user_intent_text = ''
    captured_ID = ''
    full_ans = ''
    history = ''
    info = ''
    x = 0
    prompt = f"""
Current conversation:
"""
def reset_echo(history):
    # Keep only the welcome message in the chat window.
    history = [history[0]]
    yield history

# "Hello, this is Ammar, a specialist in the UAE Ministry of Justice law encyclopedia. How can I help you?"
welcome_message = " مرحبا معك عمار متخصص في موسوعة القوانين لوزارة العدل بالامارات.كيف يمكنني مساعدتك ؟ "
# "The legal portal of the Ministry of Justice - United Arab Emirates - laws and legislation"
desc = "البوابة القانونية لوزارة العدل - الامارات العربية المتحدة- القوانين والتشريعات"
with gr.Blocks(theme=gr.themes.Soft(), title="HI") as demo:
    with gr.Row():
        image_path = "https://i.postimg.cc/kgJGhg32/UAE-MOJ-img.png"
        gr.Image(image_path, height=120, show_download_button=False, show_label=False)
    gr.Markdown(value=desc, rtl=True)
    chatbot = gr.Chatbot(value=[(None, welcome_message)], height=350, rtl=True)
    with gr.Row():
        msg = gr.Textbox(container=False, min_width=750)
        submit_btn = gr.Button(value="Submit", variant="primary")
    with gr.Row():
        new_search = gr.Button(value="بحث جديد")  # "New search"
    # "New search" first resets the session state, then (below) clears the chat.
    new_search.click(fn=test_function)
    # gr.ClearButton([msg, chatbot])
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        slow_echo, chatbot, chatbot
    )
    submit_btn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        slow_echo, chatbot, chatbot
    )
    new_search.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        reset_echo, chatbot, chatbot
    )
demo.launch(inline=False)