File size: 7,847 Bytes
363dca9 d6dd331 363dca9 63b8443 395b4fd 8d6dddf 395b4fd 8a06648 d6dd331 eb9c0fd 12089f2 363dca9 10dd042 363dca9 81bd534 363dca9 12089f2 1553fd1 d6dd331 f546bb1 ab1510b f546bb1 03bdf09 d65f8d2 03bdf09 455a7e3 d6dd331 eac2860 2366d6a d6dd331 bdec671 2366d6a eac2860 8fb0ee5 bdec671 d6dd331 eac2860 bdec671 a64b522 bf15223 90563e5 5466ff9 24f5bbf ed9f8bd 03bdf09 bf15223 e4d6d02 395b4fd d6dd331 79d8c94 5fabd6a d6dd331 9fd4ca4 009edd4 d6dd331 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
import os
import shutil
import tempfile
import textwrap

import pandas as pd
import streamlit as st
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader
########################################################################################
def find_most_similar_paragraph(input_sentence, text):
    """Return the paragraph of *text* most similar to *input_sentence*.

    Similarity is word-overlap (Jaccard-style, normalized by the input's
    word count). NOTE: this function is shadowed by the single-argument
    redefinition of the same name later in this file.

    Args:
        input_sentence: Query sentence to match.
        text: Full document; paragraphs are assumed separated by blank lines.

    Returns:
        The best-matching paragraph, or "" when there is no overlap or the
        input sentence contains no words.
    """
    # BUG FIX: an empty/whitespace-only input sentence previously caused
    # ZeroDivisionError; also hoist the invariant word set out of the loop.
    input_words = set(input_sentence.lower().split())
    if not input_words:
        return ""
    paragraphs = text.split("\n\n")
    max_similarity = 0
    most_similar_paragraph = ""
    for paragraph in paragraphs:
        # Lowercase/strip so comparison is case- and whitespace-insensitive.
        cleaned_paragraph = paragraph.lower().strip()
        similarity = len(input_words & set(cleaned_paragraph.split())) / len(input_words)
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_paragraph = paragraph
    return most_similar_paragraph
#######################################################################################
def copy_file_content(src_file, dest_file):
    """Copy the contents of *src_file* to *dest_file*.

    Prints a success or error message (in Persian) instead of raising.

    Args:
        src_file: Path of the file to copy from.
        dest_file: Path of the file to copy to (overwritten if it exists).
    """
    # BUG FIX: `shutil` was used here but never imported at file level;
    # the import block now provides it.
    try:
        shutil.copyfile(src_file, dest_file)
        print(f"محتوای فایل {src_file} با موفقیت به {dest_file} کپی شد.")
    except FileNotFoundError:
        print(f"فایل {src_file} یافت نشد.")
    except Exception as e:
        # Best-effort helper: report, don't propagate.
        print(f"خطا در کپی فایل: {e}")
########################################################################################
def show_path():
    """Offer a csv/txt upload widget; persist the upload to a temp dir.

    Returns:
        Filesystem path of the saved copy, or None when nothing was uploaded.
    """
    uploaded_file = st.file_uploader("فایل را آپلود کنید", type=["csv", "txt"])
    if not uploaded_file:
        return None
    # Save the in-memory upload under its original name inside a fresh
    # temporary directory so downstream code can use a real path.
    temp_dir = tempfile.mkdtemp()
    saved_path = os.path.join(temp_dir, uploaded_file.name)
    with open(saved_path, "wb") as out:
        out.write(uploaded_file.getvalue())
    return saved_path
#########################################################
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: Iterable of PDF file-like objects (e.g. Streamlit uploads).

    Returns:
        All extracted text as a single string ("" for no docs / no text).
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # BUG FIX: extract_text() may return None for image-only pages,
            # which made `+=` raise TypeError; coerce to "".
            text += page.extract_text() or ""
    return text
#################################################################
def wrap_text_preserve_newlines(text, width=300):
    """Word-wrap *text* to *width* columns without merging existing lines.

    Each original line is wrapped independently, so hard newlines in the
    input are preserved (unlike a single textwrap.fill over the whole text).

    Args:
        text: Input string; may contain embedded newlines.
        width: Maximum line width in characters.

    Returns:
        The wrapped string.
    """
    # BUG FIX: `textwrap` was used here but never imported at file level;
    # the import block now provides it.
    lines = text.split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    return '\n'.join(wrapped_lines)
########################################################################
# Text Splitter
# Text Splitter
def text_split(documents):
    """Split LangChain documents into 300-char chunks with 20-char overlap.

    Args:
        documents: Sequence of LangChain Document objects.

    Returns:
        List of chunked Document objects.
    """
    # BUG FIX: the parameter was misspelled `ducuments` while the body used
    # `documents`, so every call raised NameError.
    text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=20)
    docs = text_splitter.split_documents(documents)
    return docs
######################################################################
# Embeddings
# Embeddings
def embeddings_f():
    """Build and return a default HuggingFace embeddings model."""
    return HuggingFaceEmbeddings()
##########################################################################
#upload pdf
# upload pdf
def pdf_upload():
    """Render a multi-file uploader and return the uploaded PDF files."""
    uploads = st.file_uploader("Upload your PDF Files", accept_multiple_files=True)
    return uploads
##############################################################################
def show(text):
    """Render *text* in the Streamlit app; returns st.write's result."""
    return st.write(text)
###########################################################################
def upload_txt_file():
    """Offer a txt-file upload widget and return its decoded contents.

    Returns:
        The file's text (UTF-8 decoded), or a Persian error message string
        when no file has been uploaded yet.
    """
    uploaded_file = st.file_uploader("لطفا یک فایل txt را انتخاب کنید", type="txt")
    if uploaded_file is not None:
        # BUG FIX: the original passed the Streamlit UploadedFile object to
        # open(), which raises TypeError — the upload lives in memory, not on
        # disk. Decode its bytes directly instead.
        text = uploaded_file.getvalue().decode("utf-8")
        return text
    else:
        # No upload yet: return an instructional message instead of raising.
        return "لطفا یک فایل txt را آپلود کنید"
###############################################################################
def find_most_similar_paragraph(input_sentence):
    """Return the paragraph of the local file ``d1.txt`` most similar to
    *input_sentence*.

    NOTE: this redefinition shadows the two-argument function of the same
    name defined earlier in the file. Similarity is word-overlap normalized
    by the input's word count.

    Args:
        input_sentence: Query sentence to match.

    Returns:
        The best-matching paragraph, "" when nothing matches or the input
        has no words, or a Persian error message when d1.txt is missing.
    """
    try:
        with open("d1.txt", 'r', encoding='utf-8') as file:
            text = file.read()
        # BUG FIX: an empty/whitespace-only input sentence previously caused
        # ZeroDivisionError; also hoist the invariant word set out of the loop.
        input_words = set(input_sentence.lower().split())
        if not input_words:
            return ""
        # Paragraphs are assumed separated by blank lines.
        paragraphs = text.split("\n\n")
        max_similarity = 0
        most_similar_paragraph = ""
        for paragraph in paragraphs:
            cleaned_paragraph = paragraph.lower().strip()
            similarity = len(input_words & set(cleaned_paragraph.split())) / len(input_words)
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_paragraph = paragraph
        return most_similar_paragraph
    except FileNotFoundError:
        return "فایل مورد نظر یافت نشد."
# Call the function
########################################################################################################
def main():
    """Streamlit entry point: build the model, answer a question, and show
    the most similar source paragraph from d1.txt."""
    st.set_page_config(page_title="Multiple pdf chat", page_icon=":books:")
    if st.button("build model"):
        with st.spinner("waiting"):
            # Importing app2 triggers its module-level model-building code.
            import app2
        st.write("compelete build model")
    query = st.text_input("enter a question:")
    if st.button("answer your question"):
        with st.spinner("waiting"):
            from app2 import db
            from app2 import load_model
            from app2 import find_help
            out = load_model(query, db)
            st.write(out)
            st.write(query)
            st.write("************************************************")
            # Persist the answer so find_most_similar_paragraph can read it.
            # BUG FIX: dropped the redundant file.write("") — mode "w"
            # already truncates the file.
            with open('d1.txt', 'w', encoding='utf-8') as file:
                file.write(out)
            result = find_most_similar_paragraph(query)
            st.write(result)
    with st.sidebar:
        uploaded_file = st.file_uploader("فایل را آپلود کنید", type=["csv", "txt"])
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    main()
|