ZeeAI1 committed on
Commit
28296a9
·
verified ·
1 Parent(s): f695ad6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -49
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import streamlit as st
3
  import pdfplumber
4
- from concurrent.futures import ThreadPoolExecutor
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain.embeddings import HuggingFaceEmbeddings
7
  from langchain.vectorstores import FAISS
@@ -13,8 +12,7 @@ st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon
13
  # Load the summarization pipeline model
14
  @st.cache_resource
15
  def load_summarization_pipeline():
16
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
17
- return summarizer
18
 
19
  summarizer = load_summarization_pipeline()
20
 
@@ -41,8 +39,7 @@ LANGUAGES = {
41
  @st.cache_data
42
  def get_text_chunks(text):
43
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
44
- chunks = text_splitter.split_text(text)
45
- return chunks
46
 
47
  # Initialize embedding function
48
  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
@@ -50,11 +47,7 @@ embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all
50
  # Create a FAISS vector store with embeddings
51
  @st.cache_resource
52
  def load_or_create_vector_store(text_chunks):
53
- if not text_chunks:
54
- st.error("No valid text chunks found to create a vector store. Please check your PDF files.")
55
- return None
56
- vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
57
- return vector_store
58
 
59
  # Helper function to process a single PDF
60
  def process_single_pdf(file_path):
@@ -73,57 +66,67 @@ def process_single_pdf(file_path):
73
  def load_pdfs_with_progress(folder_path):
74
  if not os.path.exists(folder_path):
75
  st.error(f"The folder '{folder_path}' does not exist. Please create it and add PDF files.")
76
- st.session_state['vector_store'] = None
77
- st.session_state['loading'] = False
78
- return
79
 
80
  all_text = ""
81
  pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.pdf')]
82
- num_files = len(pdf_files)
83
-
84
- if num_files == 0:
85
  st.error("No PDF files found in the specified folder.")
86
- st.session_state['vector_store'] = None
87
- st.session_state['loading'] = False
88
- return
89
 
90
  st.markdown("### Loading data...")
91
  progress_bar = st.progress(0)
92
- status_text = st.empty()
93
 
94
- processed_count = 0
95
-
96
- for file_path in pdf_files:
97
- result = process_single_pdf(file_path)
98
- all_text += result
99
- processed_count += 1
100
- progress_percentage = int((processed_count / num_files) * 100)
101
- progress_bar.progress(processed_count / num_files)
102
- status_text.text(f"Loading documents: {progress_percentage}% completed")
103
 
104
  progress_bar.empty()
105
- status_text.text("Document loading completed!")
106
-
107
- if all_text:
108
- text_chunks = get_text_chunks(all_text)
109
- vector_store = load_or_create_vector_store(text_chunks)
110
- st.session_state['vector_store'] = vector_store
111
- else:
112
- st.session_state['vector_store'] = None
113
-
114
- st.session_state['loading'] = False
115
 
116
  # Generate summary based on retrieved text
117
- def generate_summary_with_huggingface(query, retrieved_text):
118
- summarization_input = f"{query} Related information:{retrieved_text}"
119
- max_input_length = 1024
120
- summarization_input = summarization_input[:max_input_length]
121
  summary = summarizer(summarization_input, max_length=500, min_length=50, do_sample=False)
122
  return summary[0]["summary_text"]
123
 
124
- # Generate response for user query
125
- def user_input(user_question):
126
- vector_store = st.session_state.get('vector_store')
127
- if vector_store is None:
128
- return "The app is still loading documents or no documents were successfully loaded."
129
- docs = vector_store.simi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import streamlit as st
3
  import pdfplumber
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.vectorstores import FAISS
 
12
  # Load the summarization pipeline model
13
  @st.cache_resource
14
  def load_summarization_pipeline():
15
+ return pipeline("summarization", model="facebook/bart-large-cnn")
 
16
 
17
  summarizer = load_summarization_pipeline()
18
 
 
39
  @st.cache_data
40
  def get_text_chunks(text):
41
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
42
+ return text_splitter.split_text(text)
 
43
 
44
  # Initialize embedding function
45
  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
47
  # Create a FAISS vector store with embeddings
48
  @st.cache_resource
49
  def load_or_create_vector_store(text_chunks):
50
+ return FAISS.from_texts(text_chunks, embedding=embedding_function) if text_chunks else None
 
 
 
 
51
 
52
  # Helper function to process a single PDF
53
  def process_single_pdf(file_path):
 
66
  def load_pdfs_with_progress(folder_path):
67
  if not os.path.exists(folder_path):
68
  st.error(f"The folder '{folder_path}' does not exist. Please create it and add PDF files.")
69
+ return None
 
 
70
 
71
  all_text = ""
72
  pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.pdf')]
73
+ if not pdf_files:
 
 
74
  st.error("No PDF files found in the specified folder.")
75
+ return None
 
 
76
 
77
  st.markdown("### Loading data...")
78
  progress_bar = st.progress(0)
 
79
 
80
+ for i, file_path in enumerate(pdf_files):
81
+ all_text += process_single_pdf(file_path)
82
+ progress_bar.progress((i + 1) / len(pdf_files))
 
 
 
 
 
 
83
 
84
  progress_bar.empty()
85
+ return load_or_create_vector_store(get_text_chunks(all_text)) if all_text else None
 
 
 
 
 
 
 
 
 
86
 
87
  # Generate summary based on retrieved text
88
def generate_summary(query, retrieved_text):
    """Summarize *retrieved_text* in the context of *query*.

    The query and the retrieved text are combined into one prompt, the prompt
    is cut to its first 1024 characters, and the module-level ``summarizer``
    is run on it. Returns the ``summary_text`` of the first result.
    """
    prompt = f"{query} Related information:{retrieved_text}"
    # The cut is by characters, not by model tokens.
    clipped_prompt = prompt[:1024]
    results = summarizer(clipped_prompt, max_length=500, min_length=50, do_sample=False)
    first_result = results[0]
    return first_result["summary_text"]
92
 
93
+ # Translate text to selected language
94
def translate_text(text, target_lang):
    """Translate *text* into *target_lang* and return the decoded string.

    Sets ``tgt_lang`` on the module-level ``translation_tokenizer``, encodes
    *text* as PyTorch tensors, generates with ``translation_model`` and
    returns the first decoded sequence with special tokens removed.
    """
    # NOTE(review): this mutates the shared tokenizer object on every call.
    # NOTE(review): some multilingual models also need a forced BOS token id
    # passed to generate() to select the output language - confirm against
    # the model loaded elsewhere in this file.
    translation_tokenizer.tgt_lang = target_lang
    model_inputs = translation_tokenizer(text, return_tensors="pt")
    output_ids = translation_model.generate(**model_inputs)
    decoded = translation_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
99
+
100
+ # Main function to run the Streamlit app
101
def main():
    """Render the Streamlit page and answer one question per button press.

    Shows the page title, makes sure a vector store is available in
    ``st.session_state["vector_store"]`` (loading the PDFs from the
    ``documents1`` folder when it is not), then reads a question and an output
    language. On "Get Response" it runs a similarity search, summarizes the
    retrieved text, translates the summary and displays it.
    """
    st.markdown(
        """
        <h1 style="font-size:30px; text-align: center;">
        📄 JusticeCompass: Your AI-Powered Legal Navigator for Swift, Accurate Guidance.
        </h1>
        """,
        unsafe_allow_html=True
    )

    # Retry whenever no store is available. Only checking for the key would
    # keep a failed load (None) for the whole session: every rerun would
    # return below without showing an error or trying again.
    if st.session_state.get("vector_store") is None:
        st.session_state["vector_store"] = load_pdfs_with_progress('documents1')
    vector_store = st.session_state["vector_store"]
    if vector_store is None:
        return

    # Prompt input
    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    # Language selection dropdown
    selected_language = st.selectbox("Select output language:", list(LANGUAGES.keys()))

    # Short-circuit: the button is only rendered once a question was typed.
    if user_question and st.button("Get Response"):
        with st.spinner("Generating response..."):
            docs = vector_store.similarity_search(user_question)
            context_text = " ".join(doc.page_content for doc in docs)
            answer = generate_summary(user_question, context_text)
            translated_answer = translate_text(answer, LANGUAGES[selected_language])
            st.markdown(f"**🤖 AI ({selected_language}):** {translated_answer}")


if __name__ == "__main__":
    main()
132
+