File size: 7,847 Bytes
363dca9 d6dd331 363dca9 63b8443 395b4fd 8d6dddf 395b4fd 8a06648 d6dd331 eb9c0fd 12089f2 363dca9 10dd042 363dca9 81bd534 363dca9 12089f2 1553fd1 d6dd331 f546bb1 ab1510b f546bb1 03bdf09 d65f8d2 03bdf09 455a7e3 d6dd331 eac2860 2366d6a d6dd331 bdec671 2366d6a eac2860 8fb0ee5 bdec671 d6dd331 eac2860 bdec671 a64b522 bf15223 90563e5 5466ff9 24f5bbf ed9f8bd 03bdf09 bf15223 e4d6d02 395b4fd d6dd331 79d8c94 5fabd6a d6dd331 9fd4ca4 009edd4 d6dd331 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
import os
import shutil
import tempfile
import textwrap

import pandas as pd
import streamlit as st
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader
########################################################################################
def find_most_similar_paragraph(input_sentence, text):
    """Return the paragraph of *text* most similar to *input_sentence*.

    Similarity is word-overlap (Jaccard-style, normalized by the input's
    word count). NOTE: this function is shadowed by the single-argument
    redefinition of the same name later in this file.

    Args:
        input_sentence: Query sentence to match.
        text: Full document; paragraphs are assumed separated by blank lines.

    Returns:
        The best-matching paragraph, or "" when there is no overlap or the
        input sentence contains no words.
    """
    # BUG FIX: an empty/whitespace-only input sentence previously caused
    # ZeroDivisionError; also hoist the invariant word set out of the loop.
    input_words = set(input_sentence.lower().split())
    if not input_words:
        return ""
    paragraphs = text.split("\n\n")
    max_similarity = 0
    most_similar_paragraph = ""
    for paragraph in paragraphs:
        # Lowercase/strip so comparison is case- and whitespace-insensitive.
        cleaned_paragraph = paragraph.lower().strip()
        similarity = len(input_words & set(cleaned_paragraph.split())) / len(input_words)
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_paragraph = paragraph
    return most_similar_paragraph
#######################################################################################
def copy_file_content(src_file, dest_file):
    """Copy the contents of *src_file* to *dest_file*.

    Prints a success or error message (in Persian) instead of raising.

    Args:
        src_file: Path of the file to copy from.
        dest_file: Path of the file to copy to (overwritten if it exists).
    """
    # BUG FIX: `shutil` was used here but never imported at file level;
    # the import block now provides it.
    try:
        shutil.copyfile(src_file, dest_file)
        print(f"محتوای فایل {src_file} با موفقیت به {dest_file} کپی شد.")
    except FileNotFoundError:
        print(f"فایل {src_file} یافت نشد.")
    except Exception as e:
        # Best-effort helper: report, don't propagate.
        print(f"خطا در کپی فایل: {e}")
########################################################################################
def show_path():
    """Offer a csv/txt upload widget; persist the upload to a temp dir.

    Returns:
        Filesystem path of the saved copy, or None when nothing was uploaded.
    """
    uploaded_file = st.file_uploader("فایل را آپلود کنید", type=["csv", "txt"])
    if not uploaded_file:
        return None
    # Save the in-memory upload under its original name inside a fresh
    # temporary directory so downstream code can use a real path.
    temp_dir = tempfile.mkdtemp()
    saved_path = os.path.join(temp_dir, uploaded_file.name)
    with open(saved_path, "wb") as out:
        out.write(uploaded_file.getvalue())
    return saved_path
#########################################################
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: Iterable of PDF file-like objects (e.g. Streamlit uploads).

    Returns:
        All extracted text as a single string ("" for no docs / no text).
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # BUG FIX: extract_text() may return None for image-only pages,
            # which made `+=` raise TypeError; coerce to "".
            text += page.extract_text() or ""
    return text
#################################################################
def wrap_text_preserve_newlines(text, width=300):
    """Word-wrap *text* to *width* columns without merging existing lines.

    Each original line is wrapped independently, so hard newlines in the
    input are preserved (unlike a single textwrap.fill over the whole text).

    Args:
        text: Input string; may contain embedded newlines.
        width: Maximum line width in characters.

    Returns:
        The wrapped string.
    """
    # BUG FIX: `textwrap` was used here but never imported at file level;
    # the import block now provides it.
    lines = text.split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    return '\n'.join(wrapped_lines)
########################################################################
# Text Splitter
# Text Splitter
def text_split(documents):
    """Split LangChain documents into 300-char chunks with 20-char overlap.

    Args:
        documents: Sequence of LangChain Document objects.

    Returns:
        List of chunked Document objects.
    """
    # BUG FIX: the parameter was misspelled `ducuments` while the body used
    # `documents`, so every call raised NameError.
    text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=20)
    docs = text_splitter.split_documents(documents)
    return docs
######################################################################
# Embeddings
# Embeddings
def embeddings_f():
    """Build and return a default HuggingFace embeddings model."""
    return HuggingFaceEmbeddings()
##########################################################################
#upload pdf
# upload pdf
def pdf_upload():
    """Render a multi-file uploader and return the uploaded PDF files."""
    uploads = st.file_uploader("Upload your PDF Files", accept_multiple_files=True)
    return uploads
##############################################################################
def show(text):
    """Render *text* in the Streamlit app; returns st.write's result."""
    return st.write(text)
###########################################################################
def upload_txt_file():
    """Offer a txt-file upload widget and return its decoded contents.

    Returns:
        The file's text (UTF-8 decoded), or a Persian error message string
        when no file has been uploaded yet.
    """
    uploaded_file = st.file_uploader("لطفا یک فایل txt را انتخاب کنید", type="txt")
    if uploaded_file is not None:
        # BUG FIX: the original passed the Streamlit UploadedFile object to
        # open(), which raises TypeError — the upload lives in memory, not on
        # disk. Decode its bytes directly instead.
        text = uploaded_file.getvalue().decode("utf-8")
        return text
    else:
        # No upload yet: return an instructional message instead of raising.
        return "لطفا یک فایل txt را آپلود کنید"
###############################################################################
def find_most_similar_paragraph(input_sentence):
    """Return the paragraph of the local file ``d1.txt`` most similar to
    *input_sentence*.

    NOTE: this redefinition shadows the two-argument function of the same
    name defined earlier in the file. Similarity is word-overlap normalized
    by the input's word count.

    Args:
        input_sentence: Query sentence to match.

    Returns:
        The best-matching paragraph, "" when nothing matches or the input
        has no words, or a Persian error message when d1.txt is missing.
    """
    try:
        with open("d1.txt", 'r', encoding='utf-8') as file:
            text = file.read()
        # BUG FIX: an empty/whitespace-only input sentence previously caused
        # ZeroDivisionError; also hoist the invariant word set out of the loop.
        input_words = set(input_sentence.lower().split())
        if not input_words:
            return ""
        # Paragraphs are assumed separated by blank lines.
        paragraphs = text.split("\n\n")
        max_similarity = 0
        most_similar_paragraph = ""
        for paragraph in paragraphs:
            cleaned_paragraph = paragraph.lower().strip()
            similarity = len(input_words & set(cleaned_paragraph.split())) / len(input_words)
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar_paragraph = paragraph
        return most_similar_paragraph
    except FileNotFoundError:
        return "فایل مورد نظر یافت نشد."
# Call the function
########################################################################################################
def main():
    """Streamlit entry point: build the model, answer a question, and show
    the most similar source paragraph from d1.txt."""
    st.set_page_config(page_title="Multiple pdf chat", page_icon=":books:")
    if st.button("build model"):
        with st.spinner("waiting"):
            # Importing app2 triggers its module-level model-building code.
            import app2
        st.write("compelete build model")
    query = st.text_input("enter a question:")
    if st.button("answer your question"):
        with st.spinner("waiting"):
            from app2 import db
            from app2 import load_model
            from app2 import find_help
            out = load_model(query, db)
            st.write(out)
            st.write(query)
            st.write("************************************************")
            # Persist the answer so find_most_similar_paragraph can read it.
            # BUG FIX: dropped the redundant file.write("") — mode "w"
            # already truncates the file.
            with open('d1.txt', 'w', encoding='utf-8') as file:
                file.write(out)
            result = find_most_similar_paragraph(query)
            st.write(result)
    with st.sidebar:
        uploaded_file = st.file_uploader("فایل را آپلود کنید", type=["csv", "txt"])
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    main()
|