Spaces:

pratikshahp
/

rag-zip-file

Sleeping

App Files Files Community

pratikshahp committed on May 31, 2024

Commit

cb550f3

verified ·

1 Parent(s): 78df8e6

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -96

app.py CHANGED Viewed

@@ -1,44 +1,29 @@
-import os
 import streamlit as st
 import fitz  # PyMuPDF
 import zipfile
 import io
-from transformers import BertForQuestionAnswering, BertTokenizer
 from sentence_transformers import SentenceTransformer
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from bs4 import BeautifulSoup
-import chromadb
-from chromadb.utils import embedding_functions
-from chromadb.utils.database import VectorDatabase
-# Ensure pysqlite3 is imported and used
-import pysqlite3
-import pysqlite3.dbapi2 as sqlite3
-os.environ["SQLITE_LIBRARY_PATH"] = pysqlite3.__file__
-# Load Hugging Face model and tokenizer
-model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
-qa_model = BertForQuestionAnswering.from_pretrained(model_name)
-qa_tokenizer = BertTokenizer.from_pretrained(model_name)
-# Function to get response from Hugging Face QA model
-def get_llm_response(question, context):
     try:
-        inputs = qa_tokenizer.encode_plus(question, context, return_tensors='pt')
-        with torch.no_grad():
-            outputs = qa_model(**inputs)
-            answer_start_scores = outputs.start_logits
-            answer_end_scores = outputs.end_logits
-        answer_start = torch.argmax(answer_start_scores)
-        answer_end = torch.argmax(answer_end_scores) + 1
-        answer = qa_tokenizer.convert_tokens_to_string(
-            qa_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end])
-        )
-        return answer
     except Exception as e:
-        st.error(f"Error occurred while getting response from QA model: {e}")
         return ""
 # Function to extract text from PDF file
@@ -49,8 +34,8 @@ def extract_text_from_pdf(file):
             for page in doc:
                 text += page.get_text()
             return text
-    except Exception as e:
-        st.error(f"Error occurred while processing PDF: {e}")
         return ""
 # Function to extract text from HTML file
@@ -59,7 +44,7 @@ def extract_text_from_html(file):
         soup = BeautifulSoup(file, 'html.parser')
         return soup.get_text()
     except Exception as e:
-        st.error(f"Error occurred while processing HTML: {e}")
         return ""
 # Function to extract text from text file
@@ -67,82 +52,85 @@ def extract_text_from_txt(file):
     try:
         return file.read().decode("utf-8")
     except Exception as e:
-        st.error(f"Error occurred while processing text file: {e}")
         return ""
 # Main function
 def main():
     st.title("ZIP File Chatbot")
     st.sidebar.title("Upload ZIP File")
     uploaded_file = st.sidebar.file_uploader("Choose a ZIP file", type=['zip'])
     prompt = st.text_input("Ask a Question", "")
     submitted = st.button("Submit")
     if submitted:
-        if uploaded_file is not None:
-            bytes_data = uploaded_file.read()
-            zip_file = io.BytesIO(bytes_data)
-            extracted_texts = []
-            with zipfile.ZipFile(zip_file, 'r') as z:
-                for file_info in z.infolist():
-                    with z.open(file_info) as file:
-                        if file_info.filename.endswith('.pdf'):
-                            pdf_text = extract_text_from_pdf(file.read())
-                            if pdf_text:
-                                extracted_texts.append(pdf_text)
-                        elif file_info.filename.endswith('.html') or file_info.filename.endswith('.htm'):
-                            html_text = extract_text_from_html(file.read())
-                            if html_text:
-                                extracted_texts.append(html_text)
-                        elif file_info.filename.endswith('.txt'):
-                            txt_text = extract_text_from_txt(file.read())
-                            if txt_text:
-                                extracted_texts.append(txt_text)
-            combined_text = "\n".join(extracted_texts)
-            if combined_text:
-                try:
-                    embeddings = HuggingFaceEmbeddings()
-                    text_splitter = RecursiveCharacterTextSplitter(
-                        chunk_size=1000,
-                        chunk_overlap=20,
-                        length_function=len
-                    )
-                    chunks = text_splitter.split_text(combined_text)
-                    # Initialize ChromaDB
-                    db = VectorDatabase(name="document_collection")
-                    embedding_function = embedding_functions.EmbeddingFunction(lambda x: embeddings.encode(x))
-                    # Insert vectors into ChromaDB
-                    for chunk in chunks:
-                        vector = embedding_function(chunk)
-                        db.insert({"text": chunk, "vector": vector})
-                    st.write("Embeddings stored successfully in ChromaDB.")
-                    st.write(f"Collection name: document_collection")
-                    if prompt:
-                        # Search similar vectors in ChromaDB
-                        query_vector = embedding_function(prompt)
-                        results = db.search({"vector": query_vector})
-                        st.write(results)
-                        if results:
-                            text = results[0]["text"]
-                            response = get_llm_response(prompt, text)
-                            st.subheader("Generated Answer:")
-                            st.write(response)
-                        else:
-                            st.warning("No similar documents found.")
-                except Exception as e:
-                    st.error(f"Error occurred during text processing: {e}")
 if __name__ == "__main__":
     main()

 import streamlit as st
 import fitz  # PyMuPDF
 import zipfile
 import io
+import os
+from transformers import BartForConditionalGeneration, BartTokenizer
 from sentence_transformers import SentenceTransformer
+from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from bs4 import BeautifulSoup
+# Load Hugging Face BART model and tokenizer
+model_name = "facebook/bart-large-cnn"
+bart_model = BartForConditionalGeneration.from_pretrained(model_name)
+bart_tokenizer = BartTokenizer.from_pretrained(model_name)
+# Function to get response from BART model
+def get_llm_response(input_prompt, context, question):
     try:
+        inputs = bart_tokenizer.encode(f"{input_prompt} {context} Question: {question}", return_tensors="pt", max_length=1024, truncation=True)
+        summary_ids = bart_model.generate(inputs, max_length=200, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
+        response = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        return response
     except Exception as e:
+        st.error(f"Error occurred while getting response from BART model: {e}")
         return ""
 # Function to extract text from PDF file
             for page in doc:
                 text += page.get_text()
             return text
+    except fitz.fitz.PDFError as e:
+        print(f"Error occurred while processing PDF: {e}")
         return ""
 # Function to extract text from HTML file
         soup = BeautifulSoup(file, 'html.parser')
         return soup.get_text()
     except Exception as e:
+        print(f"Error occurred while processing HTML: {e}")
         return ""
 # Function to extract text from text file
     try:
         return file.read().decode("utf-8")
     except Exception as e:
+        print(f"Error occurred while processing text file: {e}")
         return ""
 # Main function
 def main():
+    # Set title and description
     st.title("ZIP File Chatbot")
+    # Create a sidebar for file upload
     st.sidebar.title("Upload ZIP File")
     uploaded_file = st.sidebar.file_uploader("Choose a ZIP file", type=['zip'])
+    if uploaded_file is not None:
+        # Read the uploaded file as a byte stream
+        bytes_data = uploaded_file.read()
+        zip_file = io.BytesIO(bytes_data)
+        # Extract ZIP file contents
+        extracted_texts = []
+        with zipfile.ZipFile(zip_file, 'r') as z:
+            for file_info in z.infolist():
+                with z.open(file_info) as file:
+                    if file_info.filename.endswith('.pdf'):
+                        pdf_text = extract_text_from_pdf(file.read())
+                        if pdf_text:
+                            extracted_texts.append(pdf_text)
+                    elif file_info.filename.endswith('.html') or file_info.filename.endswith('.htm'):
+                        html_text = extract_text_from_html(file.read())
+                        if html_text:
+                            extracted_texts.append(html_text)
+                    elif file_info.filename.endswith('.txt'):
+                        txt_text = extract_text_from_txt(file.read())
+                        if txt_text:
+                            extracted_texts.append(txt_text)
+        # Combine extracted texts
+        combined_text = "\n".join(extracted_texts)
+        if combined_text:
+            try:
+                # Create embeddings
+                embeddings = HuggingFaceEmbeddings()
+                # Split text into chunks
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=1000,
+                    chunk_overlap=20,
+                    length_function=len,
+                    is_separator_regex=False,
+                )
+                chunks = text_splitter.create_documents([combined_text])
+                # Store chunks in ChromaDB
+                persist_directory = 'file_embeddings'
+                vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=persist_directory)
+                vectordb.persist()  # Persist ChromaDB
+                st.write("Embeddings stored successfully in ChromaDB.")
+                st.write(f"Persist directory: {persist_directory}")
+                # Load persisted Chroma database
+                vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+                st.write(vectordb)
+            except Exception as e:
+                st.error(f"Error occurred during text processing: {e}")
+    # Text input for prompt
     prompt = st.text_input("Ask a Question", "")
+    # Submit button
     submitted = st.button("Submit")
     if submitted:
+        if prompt:
+            docs = vectordb.similarity_search(prompt)
+            st.write(docs[0])
+            text = docs[0].page_content
+            input_prompt = "You are an expert in understanding text contents. You will receive input files and you will have to answer questions based on the input files."
+            response = get_llm_response(input_prompt, text, prompt)
+            st.subheader("Generated Answer:")
+            st.write(response)
 if __name__ == "__main__":
     main()