pratikshahp committed on
Commit
cb550f3
·
verified ·
1 Parent(s): 78df8e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -96
app.py CHANGED
@@ -1,44 +1,29 @@
1
- import os
2
  import streamlit as st
3
  import fitz # PyMuPDF
4
  import zipfile
5
  import io
6
- from transformers import BertForQuestionAnswering, BertTokenizer
 
7
  from sentence_transformers import SentenceTransformer
 
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  from bs4 import BeautifulSoup
11
- import chromadb
12
- from chromadb.utils import embedding_functions
13
- from chromadb.utils.database import VectorDatabase
14
-
15
- # Ensure pysqlite3 is imported and used
16
- import pysqlite3
17
- import pysqlite3.dbapi2 as sqlite3
18
- os.environ["SQLITE_LIBRARY_PATH"] = pysqlite3.__file__
19
-
20
- # Load Hugging Face model and tokenizer
21
- model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
22
- qa_model = BertForQuestionAnswering.from_pretrained(model_name)
23
- qa_tokenizer = BertTokenizer.from_pretrained(model_name)
24
-
25
- # Function to get response from Hugging Face QA model
26
- def get_llm_response(question, context):
27
  try:
28
- inputs = qa_tokenizer.encode_plus(question, context, return_tensors='pt')
29
- with torch.no_grad():
30
- outputs = qa_model(**inputs)
31
- answer_start_scores = outputs.start_logits
32
- answer_end_scores = outputs.end_logits
33
-
34
- answer_start = torch.argmax(answer_start_scores)
35
- answer_end = torch.argmax(answer_end_scores) + 1
36
- answer = qa_tokenizer.convert_tokens_to_string(
37
- qa_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end])
38
- )
39
- return answer
40
  except Exception as e:
41
- st.error(f"Error occurred while getting response from QA model: {e}")
42
  return ""
43
 
44
  # Function to extract text from PDF file
@@ -49,8 +34,8 @@ def extract_text_from_pdf(file):
49
  for page in doc:
50
  text += page.get_text()
51
  return text
52
- except Exception as e:
53
- st.error(f"Error occurred while processing PDF: {e}")
54
  return ""
55
 
56
  # Function to extract text from HTML file
@@ -59,7 +44,7 @@ def extract_text_from_html(file):
59
  soup = BeautifulSoup(file, 'html.parser')
60
  return soup.get_text()
61
  except Exception as e:
62
- st.error(f"Error occurred while processing HTML: {e}")
63
  return ""
64
 
65
  # Function to extract text from text file
@@ -67,82 +52,85 @@ def extract_text_from_txt(file):
67
  try:
68
  return file.read().decode("utf-8")
69
  except Exception as e:
70
- st.error(f"Error occurred while processing text file: {e}")
71
  return ""
72
 
73
  # Main function
74
  def main():
 
75
  st.title("ZIP File Chatbot")
76
 
 
77
  st.sidebar.title("Upload ZIP File")
78
  uploaded_file = st.sidebar.file_uploader("Choose a ZIP file", type=['zip'])
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  prompt = st.text_input("Ask a Question", "")
81
 
 
82
  submitted = st.button("Submit")
83
 
84
  if submitted:
85
- if uploaded_file is not None:
86
- bytes_data = uploaded_file.read()
87
- zip_file = io.BytesIO(bytes_data)
88
-
89
- extracted_texts = []
90
- with zipfile.ZipFile(zip_file, 'r') as z:
91
- for file_info in z.infolist():
92
- with z.open(file_info) as file:
93
- if file_info.filename.endswith('.pdf'):
94
- pdf_text = extract_text_from_pdf(file.read())
95
- if pdf_text:
96
- extracted_texts.append(pdf_text)
97
- elif file_info.filename.endswith('.html') or file_info.filename.endswith('.htm'):
98
- html_text = extract_text_from_html(file.read())
99
- if html_text:
100
- extracted_texts.append(html_text)
101
- elif file_info.filename.endswith('.txt'):
102
- txt_text = extract_text_from_txt(file.read())
103
- if txt_text:
104
- extracted_texts.append(txt_text)
105
-
106
- combined_text = "\n".join(extracted_texts)
107
-
108
- if combined_text:
109
- try:
110
- embeddings = HuggingFaceEmbeddings()
111
-
112
- text_splitter = RecursiveCharacterTextSplitter(
113
- chunk_size=1000,
114
- chunk_overlap=20,
115
- length_function=len
116
- )
117
- chunks = text_splitter.split_text(combined_text)
118
-
119
- # Initialize ChromaDB
120
- db = VectorDatabase(name="document_collection")
121
- embedding_function = embedding_functions.EmbeddingFunction(lambda x: embeddings.encode(x))
122
-
123
- # Insert vectors into ChromaDB
124
- for chunk in chunks:
125
- vector = embedding_function(chunk)
126
- db.insert({"text": chunk, "vector": vector})
127
-
128
- st.write("Embeddings stored successfully in ChromaDB.")
129
- st.write(f"Collection name: document_collection")
130
-
131
- if prompt:
132
- # Search similar vectors in ChromaDB
133
- query_vector = embedding_function(prompt)
134
- results = db.search({"vector": query_vector})
135
-
136
- st.write(results)
137
- if results:
138
- text = results[0]["text"]
139
- response = get_llm_response(prompt, text)
140
- st.subheader("Generated Answer:")
141
- st.write(response)
142
- else:
143
- st.warning("No similar documents found.")
144
- except Exception as e:
145
- st.error(f"Error occurred during text processing: {e}")
146
 
147
  if __name__ == "__main__":
148
  main()
 
 
1
  import streamlit as st
2
  import fitz # PyMuPDF
3
  import zipfile
4
  import io
5
+ import os
6
+ from transformers import BartForConditionalGeneration, BartTokenizer
7
  from sentence_transformers import SentenceTransformer
8
+ from langchain.vectorstores import Chroma
9
  from langchain.embeddings import HuggingFaceEmbeddings
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from bs4 import BeautifulSoup
12
+
13
+ # Load Hugging Face BART model and tokenizer
14
+ model_name = "facebook/bart-large-cnn"
15
+ bart_model = BartForConditionalGeneration.from_pretrained(model_name)
16
+ bart_tokenizer = BartTokenizer.from_pretrained(model_name)
17
+
18
+ # Function to get response from BART model
19
+ def get_llm_response(input_prompt, context, question):
 
 
 
 
 
 
 
 
20
  try:
21
+ inputs = bart_tokenizer.encode(f"{input_prompt} {context} Question: {question}", return_tensors="pt", max_length=1024, truncation=True)
22
+ summary_ids = bart_model.generate(inputs, max_length=200, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
23
+ response = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
24
+ return response
 
 
 
 
 
 
 
 
25
  except Exception as e:
26
+ st.error(f"Error occurred while getting response from BART model: {e}")
27
  return ""
28
 
29
  # Function to extract text from PDF file
 
34
  for page in doc:
35
  text += page.get_text()
36
  return text
37
+ except fitz.fitz.PDFError as e:
38
+ print(f"Error occurred while processing PDF: {e}")
39
  return ""
40
 
41
  # Function to extract text from HTML file
 
44
  soup = BeautifulSoup(file, 'html.parser')
45
  return soup.get_text()
46
  except Exception as e:
47
+ print(f"Error occurred while processing HTML: {e}")
48
  return ""
49
 
50
  # Function to extract text from text file
 
52
  try:
53
  return file.read().decode("utf-8")
54
  except Exception as e:
55
+ print(f"Error occurred while processing text file: {e}")
56
  return ""
57
 
58
  # Main function
59
  def main():
60
+ # Set title and description
61
  st.title("ZIP File Chatbot")
62
 
63
+ # Create a sidebar for file upload
64
  st.sidebar.title("Upload ZIP File")
65
  uploaded_file = st.sidebar.file_uploader("Choose a ZIP file", type=['zip'])
66
 
67
+ if uploaded_file is not None:
68
+ # Read the uploaded file as a byte stream
69
+ bytes_data = uploaded_file.read()
70
+ zip_file = io.BytesIO(bytes_data)
71
+
72
+ # Extract ZIP file contents
73
+ extracted_texts = []
74
+ with zipfile.ZipFile(zip_file, 'r') as z:
75
+ for file_info in z.infolist():
76
+ with z.open(file_info) as file:
77
+ if file_info.filename.endswith('.pdf'):
78
+ pdf_text = extract_text_from_pdf(file.read())
79
+ if pdf_text:
80
+ extracted_texts.append(pdf_text)
81
+ elif file_info.filename.endswith('.html') or file_info.filename.endswith('.htm'):
82
+ html_text = extract_text_from_html(file.read())
83
+ if html_text:
84
+ extracted_texts.append(html_text)
85
+ elif file_info.filename.endswith('.txt'):
86
+ txt_text = extract_text_from_txt(file.read())
87
+ if txt_text:
88
+ extracted_texts.append(txt_text)
89
+
90
+ # Combine extracted texts
91
+ combined_text = "\n".join(extracted_texts)
92
+ if combined_text:
93
+ try:
94
+ # Create embeddings
95
+ embeddings = HuggingFaceEmbeddings()
96
+
97
+ # Split text into chunks
98
+ text_splitter = RecursiveCharacterTextSplitter(
99
+ chunk_size=1000,
100
+ chunk_overlap=20,
101
+ length_function=len,
102
+ is_separator_regex=False,
103
+ )
104
+ chunks = text_splitter.create_documents([combined_text])
105
+
106
+ # Store chunks in ChromaDB
107
+ persist_directory = 'file_embeddings'
108
+ vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=persist_directory)
109
+ vectordb.persist() # Persist ChromaDB
110
+ st.write("Embeddings stored successfully in ChromaDB.")
111
+ st.write(f"Persist directory: {persist_directory}")
112
+
113
+ # Load persisted Chroma database
114
+ vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
115
+ st.write(vectordb)
116
+ except Exception as e:
117
+ st.error(f"Error occurred during text processing: {e}")
118
+
119
+ # Text input for prompt
120
  prompt = st.text_input("Ask a Question", "")
121
 
122
+ # Submit button
123
  submitted = st.button("Submit")
124
 
125
  if submitted:
126
+ if prompt:
127
+ docs = vectordb.similarity_search(prompt)
128
+ st.write(docs[0])
129
+ text = docs[0].page_content
130
+ input_prompt = "You are an expert in understanding text contents. You will receive input files and you will have to answer questions based on the input files."
131
+ response = get_llm_response(input_prompt, text, prompt)
132
+ st.subheader("Generated Answer:")
133
+ st.write(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  if __name__ == "__main__":
136
  main()