# pondsaga's picture
# Upload app.py
# 953aac1 verified
from __future__ import annotations
from typing import Iterable
import gradio as gr
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes
import time
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import numpy as np
import openai
import gradio as gr
import os
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import FAISS
import glob
import pandas as pd
import re
from openai.embeddings_utils import get_embedding, cosine_similarity
import tiktoken
import base64
import time
# API key is read from the environment; the value is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Sentence encoder used by input_to_image() to compare a question with topic phrases.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def input_to_image(input_en_sentence, threshold=0.37):
    """Pick the illustration whose topic best matches an English sentence.

    The sentence and a fixed list of topic phrases are embedded with the
    module-level ``model``; the topic with the highest cosine similarity wins.

    Args:
        input_en_sentence: Text to match against the topic phrases.
        threshold: Minimum cosine similarity required to return an image.
            Defaults to 0.37, the cut-off that used to be hard-coded here.

    Returns:
        The image path of the best-matching topic, or the string ``'None'``
        (not the ``None`` object) when no topic reaches ``threshold``.
    """
    # Each topic phrase sits next to its image so the two cannot drift apart.
    topic_images = [
        ("SCBGOLD policy", '/image/SCBGOLD-Diagram.png'),
        ("SCBGOLD risk", '/image/SCBGOLD-Risk.png'),
        ("SCBGOLD value", '/image/SCBGOLD-Chart.png'),
        ("SCBGOLD price", '/image/SCBGOLD-Chart.png'),
        ("O.R. OR Stock policy detail what is", '/image/OR-Stock.png'),
        ("why O.R. OR stock go down news risk", '/image/OR-Risk.jpg'),
        ("O.R. OR value", '/image/OR-Chart.png'),
        ("O.R. OR price", '/image/OR-Chart.png'),
    ]
    topics = [phrase for phrase, _ in topic_images]

    input_embedding = model.encode(input_en_sentence)
    topic_embeddings = model.encode(topics)
    similarity_scores = util.pytorch_cos_sim(input_embedding, topic_embeddings)
    print(similarity_scores)

    # Convert once; row 0 holds the scores of the single input sentence.
    scores = similarity_scores.numpy()[0]
    best = int(np.argmax(scores))
    if scores[best] >= threshold:
        return topic_images[best][1]
    return 'None'
# Monochrome base theme; the font list holds four Noto Sans Thai entries.
theme_1 = gr.themes.Monochrome(
    font=[gr.themes.GoogleFont('Noto Sans Thai') for _ in range(4)],
).set(
    link_text_color='*primary_600',
    prose_text_weight='300',
    block_label_text_weight='500',
)
# Custom stylesheet handed to the chat interface.
# The ".message…" rule previously lacked its closing brace, which left every
# rule after it nested inside that rule; the brace is now present.
css_1 = """
.message-wrap.svelte-1pjfiar>div.svelte-1pjfiar .svelte-1pjfiar:not(.avatar-container) img {
border-radius: 0 !important;
max-height: none !important;
max-width: 40vw !important;
}
.gradio_container {
background: linear-gradient(to right, blue, green);
}
.gallery.svelte-1viwdyg {
color: black;
}
.message.svelte-1pjfiar.svelte-1pjfiar.svelte-1pjfiar {
background: white;
position: relative;
display: flex;
flex-direction: column;
align-self: flex-end;
text-align: left;
background: var(--background-fill-secondary);
width: calc(65% - var(--spacing-xxl));
color: var(--body-text-color);
font-size: var(--text-lg);
line-height: var(--line-lg);
overflow-wrap: break-word;
overflow-x: hidden;
padding-right: calc(var(--spacing-xxl) + var(--spacing-md));
padding: calc(var(--spacing-sm) + var(--spacing-sm));
box-shadow: rgba(0, 0, 0, 0.16) 0px 1px 4px;
border: none;
}
.img {
border-radius: 0 !important;
max-height: 400px !important;
max-width: none !important;
}
.message-wrap.svelte-1pjfiar>div.svelte-1pjfiar .svelte-1pjfiar:not(.avatar-container) img {
border-radius: 0 !important;
max-height: none !important;
max-width: 40vw !important;
}
.label.svelte-13hsdno.svelte-13hsdno.svelte-13hsdno {
color: black
}
.gradio-container-4-1-2 .prose > *:first-child {
display: flex;
justify-content: center;
font-size: 50px;
font-weight: bold;
margin-top: 2px;
font-family: 'Inter';
}
"""
# HTML blurb passed as the chat interface description.
description = (
    "<p>FundLearn Chatbot is your trusted companion on the journey to "
    "financial literacy and investment success in Malaysia. "
    "Powered by cutting-edge Language Model technology (LLM), "
    "FundLearn brings you a seamless and interactive learning experience "
    "tailored to the unique landscape of the Malaysian investment market.</p>"
)
# Stylesheet and theme actually handed to gr.ChatInterface.
css, theme = css_1, theme_1

# Give the openai module the key that was read from the environment.
openai.api_key = OPENAI_API_KEY
# System prompt sent as the first message of every completion request.
_PERSONA = """
You are the good advice investor chatbot teach people to understanding the basics,
risk management strategies, and methods for portfolio diversification.
Be an AI-guided education on these topics, along with some practical tips and advice for getting started in both stock market investing in Bursa Malaysia
"""


def _image_to_data_uri(image_path):
    """Return the file at *image_path* encoded as a base64 ``data:`` URI.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import mimetypes  # local import so the file's import block stays untouched

    mime_type, _ = mimetypes.guess_type(image_path)
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode()
    # Fall back to the previously hard-coded type when the extension is unknown.
    return f"data:{mime_type or 'image/jpeg'};base64,{encoded}"


def _normalize_text(s):
    """Collapse whitespace and tidy stray punctuation in extracted PDF text."""
    s = re.sub(r'\s+', ' ', s).strip()
    # NOTE(review): the unescaped "." matches any character, so this removes
    # "<any char><space>,". Kept unchanged; confirm whether a literal ". ,"
    # was intended.
    s = re.sub(r". ,", "", s)
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    return s.strip()


def _sim_text(input_text):
    """Return the PDF passage most similar to *input_text*.

    Every PDF under ``/pdf`` is loaded, split and indexed with FAISS on each
    call, and the five nearest passages per file are collected. The
    candidates are then re-ranked by cosine similarity between their
    embeddings and the embedding of *input_text*.

    Returns:
        The text of the single best passage, or ``""`` when nothing was found.
    """
    # One embeddings client is enough for every PDF and for the re-ranking.
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

    passages = []
    for path in glob.glob('/pdf/*.pdf'):
        pages = PyPDFLoader(path).load_and_split()
        faiss_index = FAISS.from_documents(pages, embeddings)
        passages.extend(
            doc.page_content
            for doc in faiss_index.similarity_search(input_text, k=5)
        )
    if not passages:
        # Nothing to rank; skip the embedding request for the query.
        return ""

    df = pd.DataFrame({"text": [_normalize_text(text) for text in passages]})
    tokenizer = tiktoken.get_encoding("cl100k_base")
    df["n_tokens"] = df["text"].apply(lambda x: len(tokenizer.encode(x)))
    # Presumably 8192 is the embedding model's input limit (TODO confirm).
    # copy() lets the filtered frame take new columns safely.
    df = df[df.n_tokens < 8192].copy()
    df["ada_v2"] = df["text"].apply(embeddings.embed_query)

    query_embedding = get_embedding(input_text, engine="text-embedding-ada-002")
    df["similarities"] = df.ada_v2.apply(
        lambda x: cosine_similarity(x, query_embedding)
    )
    best = df.sort_values("similarities", ascending=False).head(1)
    return " \n ".join(best.text.values)


def predict(message, history):
    """Stream a chatbot reply for *message*, optionally followed by an image.

    Args:
        message: The user's latest chat message.
        history: Earlier ``(user, assistant)`` message pairs; only the most
            recent pair is forwarded to the model.

    Yields:
        The reply accumulated so far after each streamed chunk, then once more
        the full reply, with an inline ``<img>`` appended when
        ``input_to_image`` returns a picture that could be read.
    """
    started = time.time()
    messages = [{"role": "system", "content": _PERSONA}]
    for human, assistant in (history or [])[-1:]:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    # The retrieved PDF passage is supplied as an assistant turn before the question.
    messages.append({"role": "assistant", "content": _sim_text(message)})
    messages.append({"role": "user", "content": message})
    # Elapsed time is end minus start (it used to be reversed, hence negative).
    print("history Execution time: ", time.time() - started)

    started = time.time()
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo-0125',
        messages=messages,
        temperature=0.1,
        stream=True,
    )
    # With stream=True this measures the time until the stream is opened.
    print("response Execution time: ", time.time() - started)

    partial_message = ""
    for chunk in response:
        # Skip chunks without text explicitly. A bare except around the yield
        # would also swallow GeneratorExit when the consumer stops early.
        choices = chunk['choices']
        if not choices:
            continue
        delta = choices[0].get('delta') or {}
        piece = delta.get('content')
        if piece is None:
            continue
        partial_message += piece
        yield partial_message

    image_path = input_to_image(message)
    if image_path != 'None':
        try:
            data_uri = _image_to_data_uri(image_path)
        except OSError as err:
            # The text answer is already delivered; just leave the picture out.
            print(f"Could not load image {image_path}: {err}")
        else:
            partial_message += f"<br><br><img src='{data_uri}' height='20vh'>"
            print('Show image!')
    yield partial_message
# Example prompts passed to the chat interface.
examples = [
    "How can I start investing in the Bursa Malaysia as a beginner?",
    "What are some popular investment options available in Malaysia?",
    "What are the key factors to consider before investing in a property in Malaysia",
]

# Build the chat UI around predict(), then enable queuing and start the server.
chat_ui = gr.ChatInterface(
    predict,
    chatbot=gr.Chatbot(height=600),
    css=css,
    theme=theme,
    examples=examples,
    title='FundLearn Chatbot',
    description=description,
    retry_btn=None,
    undo_btn=None,
)
chat_ui.queue().launch(share=True, debug=True)