Spaces:
Sleeping
Sleeping
import glob
import html
import os
import re

import pandas as pd
import PyPDF2
import streamlit as st
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from PyPDF2 import PdfReader

from htmlTemplates import css, bot_template, user_template
# Directory of the persisted Chroma vector DB opened by get_vectorstore().
dirload = '24feb24-openaiv2'
# Directory where the "Re-Processing New Data" action saves a rebuilt vector DB.
dirsave = "terbaru"

# Load .env before creating the embeddings client: this line runs at import
# time, i.e. before main() gets a chance to call load_dotenv(), so an API key
# that lives only in .env would otherwise not be in the environment yet.
load_dotenv()

# Embedding function shared by every Chroma store in this file.
embeddings = OpenAIEmbeddings()
def import_text_file(file_path):
    """Return the whole content of a UTF-8 text file, or "" on failure.

    This is a best-effort reader: any problem is reported with ``print`` and
    turned into an empty string instead of an exception, so callers can show
    the result directly.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content as a string, or "" when the file is missing or
        cannot be read.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
    except Exception as e:  # deliberate catch-all: never crash the caller
        print(f"Error reading file: {e}")
    return ""
# List every PDF in a directory.
def list_pdf_files_and_save_titles(folder_path):
    """Return the names of the ``.pdf`` entries found in ``folder_path``.

    The extension check is case-insensitive and the names are returned in the
    order ``os.listdir`` yields them. Nothing is written to disk here. Errors
    are printed and result in an empty list.

    Args:
        folder_path: Directory to scan.

    Returns:
        A list of file names (not full paths) ending in ``.pdf``.
    """
    titles = []
    try:
        titles = [
            entry
            for entry in os.listdir(folder_path)
            if entry.lower().endswith('.pdf')
        ]
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return titles
# Read one PDF document.
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, each page followed by "\\n".

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts as a single string.
    """
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file is still open.
        return "".join(page.extract_text() + "\n" for page in reader.pages)
def get_text_chunks(text):
    """Split ``text`` into overlapping chunks for embedding.

    The splitter is configured with a single-space separator, a chunk size of
    1000 and an overlap of 200, both measured with ``len`` (characters).

    Args:
        text: The full text to split.

    Returns:
        The list of chunks produced by ``CharacterTextSplitter.split_text``.
    """
    splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks):
    """Open the Chroma vector store persisted in ``dirload``.

    Args:
        text_chunks: Accepted for interface compatibility but not used; the
            store is opened from disk rather than built from these chunks.

    Returns:
        A ``Chroma`` instance bound to ``dirload`` and the module-level
        ``embeddings``.
    """
    return Chroma(persist_directory=dirload, embedding_function=embeddings)
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain on top of ``vectorstore``.

    The chain uses a default ``ChatOpenAI`` model, the store's default
    retriever, and a ``ConversationBufferMemory`` stored under the
    ``chat_history`` key that returns message objects.

    Args:
        vectorstore: A vector store exposing ``as_retriever()``.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    chat_model = ChatOpenAI()
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
def handle_userinput(user_question):
    """Send a question to the conversation chain and render the chat history.

    The chain stored in ``st.session_state.conversation`` is called with the
    question, the returned history is saved in
    ``st.session_state.chat_history``, and every message is written to the
    page through the HTML chat templates.

    Args:
        user_question: The text typed by the user.
    """
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        # Even positions use the user template, odd ones the bot template
        # (assumes the history strictly alternates user/bot messages).
        template = user_template if i % 2 == 0 else bot_template
        # The templates are rendered as raw HTML, so both the user's text and
        # the model's answer must be escaped before being spliced in;
        # otherwise "<", "&" or markup in a message breaks the page or
        # injects HTML.
        safe_content = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_content),
                 unsafe_allow_html=True)
# Files that cache the sidebar data between runs.
# NOTE(review): the count files used to be read with lowercase names but
# written as "banyakDokumen.txt"/"banyakBerita.txt", so on a case-sensitive
# filesystem the refreshed counts were never displayed. The names that are
# read are used for both directions now -- confirm they match the files
# shipped with the app.
_DOC_COUNT_FILE = "banyakdokumen.txt"
_NEWS_COUNT_FILE = "banyakberita.txt"
_FILE_TITLES_FILE = "file_titles.txt"
# Folder holding the regulation PDFs.
_PDF_FOLDER = 'pdf/'


def _read_file_titles():
    """Return the cached PDF titles (one per line), or [] if not cached yet."""
    try:
        with open(_FILE_TITLES_FILE, "r") as file:
            return [line.strip() for line in file]
    except FileNotFoundError:
        return []


def _build_news_text(df):
    """Flatten every news row of ``df`` into one space-joined text blob."""
    columns = (df['title'], df['url'], df['datetime'],
               df['author'], df['text'], df['source'])
    entries = [
        f"berita ke-{nomor} \n judul: {title!s} \n link: {url!s} \n "
        f"tanggal rilis: {tanggal!s} \n penulis: {author!s} \n "
        f"isi berita: {text!s} \n sumber: {source!s} \n "
        for nomor, (title, url, tanggal, author, text, source)
        in enumerate(zip(*columns), start=1)
    ]
    return " ".join(entries)


def _build_documents_text(file_titles):
    """Concatenate the text of every listed PDF, wrapped in numbered markers."""
    batas = "=========="
    parts = []
    for nomor, title in enumerate(file_titles, start=1):
        isi = extract_text_from_pdf(os.path.join(_PDF_FOLDER, title))
        parts.append(
            f" \n AWAL DOKUMEN KE- {nomor} \n {batas}{isi}{batas}"
            f" \n AKHIR DOKUMEN KE- {nomor} \n "
        )
    return "".join(parts)


def _reprocess_data():
    """Refresh the cached counts/titles and rebuild the vector DB in dirsave.

    Returns:
        True when the vector DB was rebuilt, False when processing stopped
        early (an error message is shown to the user in that case).
    """
    # News: take a CSV file from the news directory.
    sumber = glob.glob("berita/*.csv")
    if not sumber:
        st.error("File CSV berita tidak ditemukan di folder berita/")
        return False
    df = pd.read_csv(sumber[0])
    print("sumber berita ditemukan")

    # Update the cached news count.
    with open(_NEWS_COUNT_FILE, "w") as file:
        file.write(str(len(df)))
    print("update file text berita berhasil")

    # Combine the news columns into text. The result is currently NOT added
    # to the indexed text (see `final` below); it is still built so missing
    # columns are detected.
    textberita = _build_news_text(df)
    print("combining and converting berhasil")

    # Regulation PDFs: refresh the cached title list and document count.
    file_titles = list_pdf_files_and_save_titles(_PDF_FOLDER)
    with open(_FILE_TITLES_FILE, "w") as file:
        file.writelines(f"{title}\n" for title in file_titles)
    with open(_DOC_COUNT_FILE, "w") as file:
        file.write(str(len(file_titles)))
    print("update file text dokumen berhasil")

    # Convert the PDF documents to text.
    textdokumen = _build_documents_text(file_titles)
    print("converting ke text untuk pdf dokumen berhasil")

    # Only the regulation documents are indexed; `+ textberita` was disabled
    # in the original code and is kept disabled here.
    final = textdokumen
    print("combining 2 sumber pelatihan berhasil")

    texts = get_text_chunks(final)
    print("splitting final text berhasil")
    if not texts:
        # Nothing to embed (no PDFs or no extractable text); do not hand an
        # empty list to the vector store.
        st.error("Tidak ada teks dokumen yang bisa diproses.")
        return False

    # Save with Chroma and persist the DB to disk.
    # NOTE(review): the store is written to `dirsave` while the chat reads
    # `dirload`, so rebuilt data is not picked up by the chat -- confirm this
    # staging is intended.
    vectorstore = Chroma.from_texts(texts,
                                    embeddings,
                                    persist_directory=dirsave)
    vectorstore.persist()
    print("simpan hasil vektor ke chroma berhasil")
    return True


def main():
    """Run the Streamlit app: chat area plus a sidebar with data statistics.

    The sidebar shows the cached regulation-PDF titles and the document/news
    counts, and offers a button that re-processes the source data.
    """
    load_dotenv()

    # Page configuration has to come before any other Streamlit call.
    st.set_page_config(page_title="Selamat Datang Di Indonesian Climate Bot",
                       page_icon=":sun_behind_rain_cloud:")
    st.write(css, unsafe_allow_html=True)

    # Build the chain once per session. Streamlit re-runs this script on
    # every interaction, and rebuilding the chain each time would throw away
    # its conversation memory.
    if "conversation" not in st.session_state:
        vectorstore = Chroma(persist_directory=dirload,
                             embedding_function=embeddings)
        st.session_state.conversation = get_conversation_chain(vectorstore)
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Indonesian Climate Chatbot :sun_behind_rain_cloud:")
    user_question = st.text_input("Tanyakan padaku seputar perubahan iklim:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.header(":blue[Jumlah Dokumen dan Berita]")
        banyakDokumen = import_text_file(_DOC_COUNT_FILE)
        banyakBerita = import_text_file(_NEWS_COUNT_FILE)

        # Show the regulation PDFs in use as a markdown bullet list.
        file_titles = _read_file_titles()
        with st.container(height=300):
            st.markdown("".join(f"- {title}\n" for title in file_titles))

        st.write("jumlah dokumen regulasi: " + ":green[{}]".format(banyakDokumen))
        st.write("jumlah dokumen berita: " + ":green[{}]".format(banyakBerita))

        if st.button("Re-Processing New Data"):
            with st.spinner("Processing..."):
                berhasil = _reprocess_data()
            if berhasil:
                st.write(":orange[Pembaharuan Berhasil!]")

        # Author label in a placeholder at the bottom of the sidebar.
        placeholder = st.sidebar.empty()
        with placeholder:
            st.markdown("**by Oriza Nurfajri**")


if __name__ == '__main__':
    main()