from __future__ import annotations from typing import Iterable import gradio as gr from gradio.themes.base import Base from gradio.themes.utils import colors, fonts, sizes import time from transformers import pipeline from sentence_transformers import SentenceTransformer, util import numpy as np import openai import gradio as gr import os from langchain.document_loaders import PyMuPDFLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.vectorstores import Chroma from langchain.embeddings import OpenAIEmbeddings from langchain.chat_models import ChatOpenAI from langchain.document_loaders import PyPDFLoader from langchain.chains import RetrievalQA from langchain.document_loaders import DirectoryLoader from langchain.vectorstores import FAISS import glob import pandas as pd import re from openai.embeddings_utils import get_embedding, cosine_similarity import tiktoken import base64 import time OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') def input_to_image(input_en_sentence): input_en_sentence = input_en_sentence sentences = ["SCBGOLD policy", "SCBGOLD risk", "SCBGOLD value", "SCBGOLD price","O.R. OR Stock policy detail what is", "why O.R. OR stock go down news risk", "O.R. OR value", "O.R. OR price"] image_urls = ['/image/SCBGOLD-Diagram.png', '/image/SCBGOLD-Risk.png', '/image/SCBGOLD-Chart.png', '/image/SCBGOLD-Chart.png', '/image/OR-Stock.png', '/image/OR-Risk.jpg', '/image/OR-Chart.png', '/image/OR-Chart.png',] input_embedding = model.encode(input_en_sentence) sentence_embeddings = model.encode(sentences) similarity_scores = util.pytorch_cos_sim(input_embedding, sentence_embeddings) index_max = np.argmax(similarity_scores.numpy()[0]) value_max = np.max(similarity_scores.numpy()[0]) print(similarity_scores) if value_max >= 0.37: return image_urls[index_max] else: return 'None' theme_1 = gr.themes.Monochrome( font=[gr.themes.GoogleFont('Noto Sans Thai'), gr.themes.GoogleFont('Noto Sans Thai'), gr.themes.GoogleFont('Noto Sans Thai'), gr.themes.GoogleFont('Noto Sans Thai')], ).set( link_text_color='*primary_600', prose_text_weight='300', block_label_text_weight='500' ) css_1 = """ .message-wrap.svelte-1pjfiar>div.svelte-1pjfiar .svelte-1pjfiar:not(.avatar-container) img { border-radius: 0 !important; max-height: none !important; max-width: 40vw !important; } .gradio_container { background: linear-gradient(to right, blue, green); } .gallery.svelte-1viwdyg { color: black; } .message.svelte-1pjfiar.svelte-1pjfiar.svelte-1pjfiar { background: white; position: relative; display: flex; flex-direction: column; align-self: flex-end; text-align: left; background: var(--background-fill-secondary); width: calc(65% - var(--spacing-xxl)); color: var(--body-text-color); font-size: var(--text-lg); line-height: var(--line-lg); overflow-wrap: break-word; overflow-x: hidden; padding-right: calc(var(--spacing-xxl) + var(--spacing-md)); padding: calc(var(--spacing-sm) + var(--spacing-sm)); box-shadow: rgba(0, 0, 0, 0.16) 0px 1px 4px; border: none; .img { border-radius: 0 !important; max-height: 400px !important; max-width: none !important; } .message-wrap.svelte-1pjfiar>div.svelte-1pjfiar .svelte-1pjfiar:not(.avatar-container) img { border-radius: 0 !important; max-height: none !important; max-width: 40vw !important; } .label.svelte-13hsdno.svelte-13hsdno.svelte-13hsdno { color: black } .gradio-container-4-1-2 .prose > *:first-child { display: flex; justify-content: center; font-size: 50px; font-weight: bold; margin-top: 2px; font-family: 'Inter'; } """ description="

FundLearn Chatbot is your trusted companion on the journey to financial literacy and investment success in Malaysia. Powered by cutting-edge Language Model technology (LLM), FundLearn brings you a seamless and interactive learning experience tailored to the unique landscape of the Malaysian investment market.

" css = css_1 theme = theme_1 openai.api_key = OPENAI_API_KEY # Replace with your key def predict(message, history): def image_to_base64(image_path): with open(image_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode() return f"data:image/jpeg;base64,{encoded_string}" def normalize_text(s, sep_token = " \n "): s = re.sub(r'\s+', ' ', s).strip() s = re.sub(r". ,","",s) s = s.replace("..",".") s = s.replace(". .",".") s = s.replace("\n", "") s = s.strip() return s def sim_text(input_text): pdf_paths = glob.glob('/pdf/*.pdf') df = pd.DataFrame(columns=['text']) for path in pdf_paths: loader = PyPDFLoader(path) pages = loader.load_and_split() faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)) docs = faiss_index.similarity_search(input_text, k=5) for doc in docs: df.loc[len(df.index)] = doc.page_content df['text']= df["text"].apply(lambda x : normalize_text(x)) tokenizer = tiktoken.get_encoding("cl100k_base") df['n_tokens'] = df["text"].apply(lambda x: len(tokenizer.encode(x))) df = df[df.n_tokens<8192] embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY) df['ada_v2'] = df["text"].apply(lambda x : embeddings.embed_query(x)) embedding = get_embedding( input_text, engine="text-embedding-ada-002" # engine should be set to the deployment name you chose when you deployed the text-embedding-ada-002 (Version 2) model ) df["similarities"] = df.ada_v2.apply(lambda x: cosine_similarity(x, embedding)) res = ( df.sort_values("similarities", ascending=False) .head(3) ) top5_text = " \n ".join(res.text[:1].values) return top5_text start_time = time.time() persona = """ You are the good advice investor chatbot teach people to understanding the basics, risk management strategies, and methods for portfolio diversification. Be an AI-guided education on these topics, along with some practical tips and advice for getting started in both stock market investing in Bursa Malaysia """ history_openai_format = [{"role": "system", "content": persona}] if len(history)>0: for human, assistant in history[-1:]: history_openai_format.append({"role": "user", "content": human }) history_openai_format.append({"role": "assistant", "content":assistant}) history_openai_format.append({"role": "assistant", "content": sim_text(message)}) history_openai_format.append({"role": "user", "content":message}) end_time = time.time() execution_time = start_time - end_time print("history Execution time: ",execution_time) start_time = time.time() response = openai.ChatCompletion.create( model='gpt-3.5-turbo-0125', messages= history_openai_format, temperature=0.1, stream=True ) end_time = time.time() execution_time = start_time - end_time print("response Execution time: ",execution_time) partial_message = "" for chunk in response: try: chunk_message = chunk['choices'][0]['delta']['content'] # extract the message partial_message = partial_message + chunk_message yield partial_message except: pass image_path = input_to_image(message) if image_path != 'None': base64_image = image_to_base64(image_path) image_text = f"

" partial_message += image_text print('Show image!') yield partial_message examples = [ "How can I start investing in the Bursa Malaysia as a beginner?", "What are some popular investment options available in Malaysia?", "What are the key factors to consider before investing in a property in Malaysia" ] gr.ChatInterface(predict,chatbot=gr.Chatbot(height=600),css = css,theme = theme,examples=examples,title='FundLearn Chatbot', description=description ,retry_btn=None,undo_btn=None).queue().launch(share=True, debug=True)