import os

import pandas as pd
import streamlit as st
from deep_translator import GoogleTranslator
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from PyPDF2 import PdfReader
|
|
| |
| os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets.get('huggingface_token', "") |
|
|
| |
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page across all uploaded PDFs.

    Pages where extraction yields None contribute an empty string.
    """
    page_texts = []
    for uploaded in pdf_docs:
        reader = PdfReader(uploaded)
        page_texts.extend(page.extract_text() or "" for page in reader.pages)
    return "".join(page_texts)
|
|
| |
def get_text_chunks(text):
    """Split raw text into overlapping ~1000-char chunks for embedding."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
    )
    return splitter.split_text(text)
|
|
| |
def get_vectorstore(text_chunks):
    """Build a FAISS vector store over the chunks using multilingual sentence embeddings.

    Returns None (and shows a UI error) when there are no chunks to index.
    """
    if not text_chunks:
        st.error("No valid text chunks available for vector store.")
        return None

    # NOTE(review): HuggingFaceBgeEmbeddings is used with a non-BGE model here;
    # presumably works, but verify whether HuggingFaceEmbeddings was intended.
    embedder = HuggingFaceBgeEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        encode_kwargs={"normalize_embeddings": True},
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
|
|
| |
def get_conversation_chain(vectorstore):
    """Create a ConversationalRetrievalChain over the given vector store.

    Returns:
        A ConversationalRetrievalChain, or None (with a UI error) when the
        vector store is missing.
    """
    if not vectorstore:
        st.error("Vector store is not initialized.")
        return None

    llm = HuggingFaceHub(
        repo_id="google/gemma-7b",
        model_kwargs={"temperature": 0.1, "max_length": 2048},
    )
    # The chain requires a real memory object keyed on "chat_history";
    # the previous bare dict (`memory={}`) fails chain validation, and
    # ConversationalRetrievalChain itself was never imported (NameError).
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm, retriever=vectorstore.as_retriever(), memory=memory
    )
|
|
| |
def get_retrieval_chain(vectorstore):
    """Return an (llm, retriever) pair for manual retrieval-augmented QA.

    Returns None (and shows a UI error) when the vector store is missing.
    """
    if not vectorstore:
        st.error("Vector store is not initialized.")
        return None

    language_model = HuggingFaceHub(
        repo_id="google/gemma-7b",
        model_kwargs={"temperature": 0.1, "max_length": 2048},
    )
    return language_model, vectorstore.as_retriever()
| |
| |
def process_csv_data(csv_file):
    """Render an uploaded CSV in the UI and flatten all its cells into one string.

    Returns "" when no file was uploaded.
    """
    if csv_file is None:
        return ""

    frame = pd.read_csv(csv_file)
    st.write("نمایش دادههای فایل CSV:")
    st.write(frame)

    # Stringify every cell, join cells within a row, then join rows — all space-separated.
    rows_as_text = frame.astype(str).apply(" ".join, axis=1)
    return rows_as_text.str.cat(sep=" ")
|
|
| |
def handle_userinput(user_question):
    """Answer a user question via retrieval-augmented generation and render the chat.

    Reads the {"llm", "retriever", "chat_history"} dict that main() stores in
    st.session_state.conversation; appends the new turn and re-renders history.
    Any failure is shown as a UI error instead of crashing the app.
    """
    if "conversation" not in st.session_state or not st.session_state.conversation:
        st.error("Conversation chain is not initialized.")
        return

    try:
        llm = st.session_state.conversation["llm"]
        retriever = st.session_state.conversation["retriever"]
        chat_history = st.session_state.conversation["chat_history"]

        docs = retriever.get_relevant_documents(user_question)

        # BUG FIX: LLM.generate expects a list of prompt strings, not a dict —
        # the original `llm.generate({"question": ..., "context": ...})` raised
        # at runtime. Build a single RAG prompt from the retrieved documents'
        # text and invoke the LLM with it instead.
        context = "\n\n".join(doc.page_content for doc in docs)
        prompt = f"Context:\n{context}\n\nQuestion: {user_question}\nAnswer:"
        answer = llm(prompt)

        chat_history.append({"user": user_question, "bot": answer})
        st.session_state.conversation["chat_history"] = chat_history

        for entry in chat_history:
            st.write(f"سوال: {entry['user']}")
            st.write(f"پاسخ: {entry['bot']}")

    except Exception as e:
        # Surface retrieval/LLM failures in the UI rather than crashing the app.
        st.error(f"خطایی رخ داده است: {str(e)}")
|
|
|
|
| |
def main():
    """Streamlit entry point: upload PDFs/CSV, build a vector index, answer questions."""
    st.set_page_config(page_title="Chat Bot PDFs", page_icon="📚")
    st.title("Chat Bot برای فایلهای PDF و CSV 📚")

    # Sidebar: file uploads and the processing trigger.
    st.sidebar.subheader("آپلود فایلها")
    pdf_docs = st.sidebar.file_uploader("فایلهای PDF خود را آپلود کنید", accept_multiple_files=True)
    csv_file = st.sidebar.file_uploader("فایل CSV خود را آپلود کنید", type=["csv"])

    if st.sidebar.button("پردازش"):
        with st.spinner("در حال پردازش..."):
            # Merge text from both sources; either may be absent.
            pdf_text = get_pdf_text(pdf_docs) if pdf_docs else ""
            csv_text = process_csv_data(csv_file) if csv_file else ""
            merged_text = pdf_text + csv_text

            if not merged_text.strip():
                st.error("هیچ متنی برای پردازش یافت نشد.")
                return

            chunks = get_text_chunks(merged_text)
            if not chunks:
                st.error("هیچ بخشی از متن برای بردارسازی یافت نشد.")
                return

            store = get_vectorstore(chunks)
            if store:
                llm, retriever = get_retrieval_chain(store)
                # Persist the RAG components across Streamlit reruns.
                st.session_state.conversation = {
                    "llm": llm,
                    "retriever": retriever,
                    "chat_history": [],
                }
                st.success("پردازش تکمیل شد!")

    user_question = st.text_input("سوال خود را وارد کنید:")
    if st.button("پاسخ"):
        handle_userinput(user_question)
|
|
# Run the Streamlit app when this file is executed directly.
if __name__ == "__main__":
    main()
|
|