Spaces:

wahab5763
/

LawTest

Sleeping

App Files Community

wahab5763 committed on Nov 9, 2024

Commit

00dab9c

verified ·

1 Parent(s): 01879a7

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -34

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import os
-import requests
 import streamlit as st
 from io import BytesIO
 from PyPDF2 import PdfReader
@@ -20,41 +19,33 @@ def load_summarization_pipeline():
 summarizer = load_summarization_pipeline()
-# Dictionary of Hugging Face PDF URLs grouped by folders
-PDF_FOLDERS = {
-    "PPC and Administration": [
-        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/PPC%20and%20Administration/Pakistan%20Penal%20Code.pdf",
-        "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/PPC%20and%20Administration/administrator92ada0936848e501425591b4ad0cd417.pdf"
-    ]
-}
-# Helper function to convert Hugging Face blob URLs to direct download URLs
-def get_huggingface_raw_url(url):
-    if "huggingface.co" in url and "/blob/" in url:
-        return url.replace("/blob/", "/resolve/")
-    return url
-# Fetch and extract text from all PDFs in specified folders
-def fetch_pdf_text_from_folders(pdf_folders):
     all_text = ""
-    for folder_name, urls in pdf_folders.items():
-        folder_text = f"\n[Folder: {folder_name}]\n"
-        for url in urls:
-            raw_url = get_huggingface_raw_url(url)
-            try:
-                response = requests.get(raw_url)
-                response.raise_for_status()
-                pdf_file = BytesIO(response.content)
-                pdf_reader = PdfReader(pdf_file)
                 for page in pdf_reader.pages:
                     page_text = page.extract_text()
                     if page_text:
-                        folder_text += page_text
-            except requests.RequestException as e:
-                st.error(f"Failed to fetch PDF from URL: {url} - {e}")
-            except Exception as e:
-                st.error(f"Failed to read PDF from URL {url}: {e}")
-        all_text += folder_text
     return all_text
 # Split text into manageable chunks
@@ -71,7 +62,7 @@ embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all
 @st.cache_resource
 def load_or_create_vector_store(text_chunks):
     if not text_chunks:
-        st.error("No valid text chunks found to create a vector store. Please check your PDF URLs or file content.")
         return None
     vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
     return vector_store
@@ -95,7 +86,10 @@ def user_input(user_question, vector_store):
 # Main function to run the Streamlit app
 def main():
     st.title("📄 Gen AI Lawyers Guide")
-    raw_text = fetch_pdf_text_from_folders(PDF_FOLDERS)
     text_chunks = get_text_chunks(raw_text)
     vector_store = load_or_create_vector_store(text_chunks)

 import os
 import streamlit as st
 from io import BytesIO
 from PyPDF2 import PdfReader
 summarizer = load_summarization_pipeline()
+# Helper function to extract text from PDFs in a local folder
+def fetch_pdf_text_from_folder(folder_path):
     all_text = ""
+    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
+    total_files = len(pdf_files)
+    if total_files == 0:
+        st.warning("No PDF files found in the folder.")
+        return ""
+    progress_bar = st.progress(0)
+    for index, file_name in enumerate(pdf_files):
+        try:
+            file_path = os.path.join(folder_path, file_name)
+            with open(file_path, 'rb') as file:
+                pdf_reader = PdfReader(file)
                 for page in pdf_reader.pages:
                     page_text = page.extract_text()
                     if page_text:
+                        all_text += f"\n[File: {file_name}]\n{page_text}"
+        except Exception as e:
+            st.error(f"Failed to read PDF file {file_name}: {e}")
+        # Update the progress bar
+        progress_percentage = int(((index + 1) / total_files) * 100)
+        progress_bar.progress(progress_percentage)
     return all_text
 # Split text into manageable chunks
 @st.cache_resource
 def load_or_create_vector_store(text_chunks):
     if not text_chunks:
+        st.error("No valid text chunks found to create a vector store. Please check your PDF files or content.")
         return None
     vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
     return vector_store
 # Main function to run the Streamlit app
 def main():
     st.title("📄 Gen AI Lawyers Guide")
+    st.info("Loading data from the 'law-docs' folder...")
+    folder_path = "law-docs"
+    raw_text = fetch_pdf_text_from_folder(folder_path)
     text_chunks = get_text_chunks(raw_text)
     vector_store = load_or_create_vector_store(text_chunks)