import os

import pandas as pd
import streamlit as st
from deep_translator import GoogleTranslator
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from PyPDF2 import PdfReader
|
|
| |
| os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets.get('huggingface_token', "") |
|
|
| |
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page across all uploaded PDFs.

    Pages where extraction yields None contribute an empty string.
    """
    page_texts = []
    for uploaded in pdf_docs:
        reader = PdfReader(uploaded)
        page_texts.extend(page.extract_text() or "" for page in reader.pages)
    return "".join(page_texts)
|
|
| |
def get_text_chunks(text):
    """Split raw text into overlapping ~1000-char chunks for embedding."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
    )
    return splitter.split_text(text)
|
|
| |
def get_vectorstore(text_chunks):
    """Build a FAISS vector store over the chunks using multilingual sentence embeddings.

    Returns None (and shows a UI error) when there are no chunks to index.
    """
    if not text_chunks:
        st.error("No valid text chunks available for vector store.")
        return None

    # NOTE(review): HuggingFaceBgeEmbeddings is used with a non-BGE model here;
    # presumably works, but verify whether HuggingFaceEmbeddings was intended.
    embedder = HuggingFaceBgeEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        encode_kwargs={"normalize_embeddings": True},
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
|
|
| |
def get_conversation_chain(vectorstore):
    """Create a ConversationalRetrievalChain over the given vector store.

    Returns:
        A ConversationalRetrievalChain, or None (with a UI error) when the
        vector store is missing.
    """
    if not vectorstore:
        st.error("Vector store is not initialized.")
        return None

    llm = HuggingFaceHub(
        repo_id="google/gemma-7b",
        model_kwargs={"temperature": 0.1, "max_length": 2048},
    )
    # The chain requires a real memory object keyed on "chat_history";
    # the previous bare dict (`memory={}`) fails chain validation, and
    # ConversationalRetrievalChain itself was never imported (NameError).
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm, retriever=vectorstore.as_retriever(), memory=memory
    )
|
|
| |
def get_retrieval_chain(vectorstore):
    """Return an (llm, retriever) pair for manual retrieval-augmented QA.

    Returns None (and shows a UI error) when the vector store is missing.
    """
    if not vectorstore:
        st.error("Vector store is not initialized.")
        return None

    language_model = HuggingFaceHub(
        repo_id="google/gemma-7b",
        model_kwargs={"temperature": 0.1, "max_length": 2048},
    )
    return language_model, vectorstore.as_retriever()
| |
| |
def process_csv_data(csv_file):
    """Render an uploaded CSV in the UI and flatten all its cells into one string.

    Returns "" when no file was uploaded.
    """
    if csv_file is None:
        return ""

    frame = pd.read_csv(csv_file)
    st.write("نمایش دادههای فایل CSV:")
    st.write(frame)

    # Stringify every cell, join cells within a row, then join rows — all space-separated.
    rows_as_text = frame.astype(str).apply(" ".join, axis=1)
    return rows_as_text.str.cat(sep=" ")
|
|
| |
def handle_userinput(user_question):
    """Answer a user question via retrieval-augmented generation and render the chat.

    Reads the {"llm", "retriever", "chat_history"} dict that main() stores in
    st.session_state.conversation; appends the new turn and re-renders history.
    Any failure is shown as a UI error instead of crashing the app.
    """
    if "conversation" not in st.session_state or not st.session_state.conversation:
        st.error("Conversation chain is not initialized.")
        return

    try:
        llm = st.session_state.conversation["llm"]
        retriever = st.session_state.conversation["retriever"]
        chat_history = st.session_state.conversation["chat_history"]

        docs = retriever.get_relevant_documents(user_question)

        # BUG FIX: LLM.generate expects a list of prompt strings, not a dict —
        # the original `llm.generate({"question": ..., "context": ...})` raised
        # at runtime. Build a single RAG prompt from the retrieved documents'
        # text and invoke the LLM with it instead.
        context = "\n\n".join(doc.page_content for doc in docs)
        prompt = f"Context:\n{context}\n\nQuestion: {user_question}\nAnswer:"
        answer = llm(prompt)

        chat_history.append({"user": user_question, "bot": answer})
        st.session_state.conversation["chat_history"] = chat_history

        for entry in chat_history:
            st.write(f"سوال: {entry['user']}")
            st.write(f"پاسخ: {entry['bot']}")

    except Exception as e:
        # Surface retrieval/LLM failures in the UI rather than crashing the app.
        st.error(f"خطایی رخ داده است: {str(e)}")
|
|
|
|
| |
def main():
    """Streamlit entry point: upload PDFs/CSV, build a vector index, answer questions."""
    st.set_page_config(page_title="Chat Bot PDFs", page_icon="📚")
    st.title("Chat Bot برای فایلهای PDF و CSV 📚")

    # Sidebar: file uploads and the processing trigger.
    st.sidebar.subheader("آپلود فایلها")
    pdf_docs = st.sidebar.file_uploader("فایلهای PDF خود را آپلود کنید", accept_multiple_files=True)
    csv_file = st.sidebar.file_uploader("فایل CSV خود را آپلود کنید", type=["csv"])

    if st.sidebar.button("پردازش"):
        with st.spinner("در حال پردازش..."):
            # Merge text from both sources; either may be absent.
            pdf_text = get_pdf_text(pdf_docs) if pdf_docs else ""
            csv_text = process_csv_data(csv_file) if csv_file else ""
            merged_text = pdf_text + csv_text

            if not merged_text.strip():
                st.error("هیچ متنی برای پردازش یافت نشد.")
                return

            chunks = get_text_chunks(merged_text)
            if not chunks:
                st.error("هیچ بخشی از متن برای بردارسازی یافت نشد.")
                return

            store = get_vectorstore(chunks)
            if store:
                llm, retriever = get_retrieval_chain(store)
                # Persist the RAG components across Streamlit reruns.
                st.session_state.conversation = {
                    "llm": llm,
                    "retriever": retriever,
                    "chat_history": [],
                }
                st.success("پردازش تکمیل شد!")

    user_question = st.text_input("سوال خود را وارد کنید:")
    if st.button("پاسخ"):
        handle_userinput(user_question)
|
|
# Run the Streamlit app when this file is executed directly.
if __name__ == "__main__":
    main()
|
|