wahab5763 committed on
Commit
00dab9c
·
verified ·
1 Parent(s): 01879a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -34
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import os
2
- import requests
3
  import streamlit as st
4
  from io import BytesIO
5
  from PyPDF2 import PdfReader
@@ -20,41 +19,33 @@ def load_summarization_pipeline():
20
 
21
  summarizer = load_summarization_pipeline()
22
 
23
- # Dictionary of Hugging Face PDF URLs grouped by folders
24
- PDF_FOLDERS = {
25
- "PPC and Administration": [
26
- "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/PPC%20and%20Administration/Pakistan%20Penal%20Code.pdf",
27
- "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/PPC%20and%20Administration/administrator92ada0936848e501425591b4ad0cd417.pdf"
28
- ]
29
- }
30
-
31
- # Helper function to convert Hugging Face blob URLs to direct download URLs
32
- def get_huggingface_raw_url(url):
33
- if "huggingface.co" in url and "/blob/" in url:
34
- return url.replace("/blob/", "/resolve/")
35
- return url
36
-
37
- # Fetch and extract text from all PDFs in specified folders
38
- def fetch_pdf_text_from_folders(pdf_folders):
39
  all_text = ""
40
- for folder_name, urls in pdf_folders.items():
41
- folder_text = f"\n[Folder: {folder_name}]\n"
42
- for url in urls:
43
- raw_url = get_huggingface_raw_url(url)
44
- try:
45
- response = requests.get(raw_url)
46
- response.raise_for_status()
47
- pdf_file = BytesIO(response.content)
48
- pdf_reader = PdfReader(pdf_file)
 
 
 
 
49
  for page in pdf_reader.pages:
50
  page_text = page.extract_text()
51
  if page_text:
52
- folder_text += page_text
53
- except requests.RequestException as e:
54
- st.error(f"Failed to fetch PDF from URL: {url} - {e}")
55
- except Exception as e:
56
- st.error(f"Failed to read PDF from URL {url}: {e}")
57
- all_text += folder_text
 
 
58
  return all_text
59
 
60
  # Split text into manageable chunks
@@ -71,7 +62,7 @@ embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all
71
  @st.cache_resource
72
  def load_or_create_vector_store(text_chunks):
73
  if not text_chunks:
74
- st.error("No valid text chunks found to create a vector store. Please check your PDF URLs or file content.")
75
  return None
76
  vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
77
  return vector_store
@@ -95,7 +86,10 @@ def user_input(user_question, vector_store):
95
  # Main function to run the Streamlit app
96
  def main():
97
  st.title("📄 Gen AI Lawyers Guide")
98
- raw_text = fetch_pdf_text_from_folders(PDF_FOLDERS)
 
 
 
99
  text_chunks = get_text_chunks(raw_text)
100
  vector_store = load_or_create_vector_store(text_chunks)
101
 
 
1
  import os
 
2
  import streamlit as st
3
  from io import BytesIO
4
  from PyPDF2 import PdfReader
 
19
 
20
  summarizer = load_summarization_pipeline()
21
 
22
+ # Helper function to extract text from PDFs in a local folder
23
+ def fetch_pdf_text_from_folder(folder_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  all_text = ""
25
+ pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
26
+ total_files = len(pdf_files)
27
+
28
+ if total_files == 0:
29
+ st.warning("No PDF files found in the folder.")
30
+ return ""
31
+
32
+ progress_bar = st.progress(0)
33
+ for index, file_name in enumerate(pdf_files):
34
+ try:
35
+ file_path = os.path.join(folder_path, file_name)
36
+ with open(file_path, 'rb') as file:
37
+ pdf_reader = PdfReader(file)
38
  for page in pdf_reader.pages:
39
  page_text = page.extract_text()
40
  if page_text:
41
+ all_text += f"\n[File: {file_name}]\n{page_text}"
42
+ except Exception as e:
43
+ st.error(f"Failed to read PDF file {file_name}: {e}")
44
+
45
+ # Update the progress bar
46
+ progress_percentage = int(((index + 1) / total_files) * 100)
47
+ progress_bar.progress(progress_percentage)
48
+
49
  return all_text
50
 
51
  # Split text into manageable chunks
 
62
  @st.cache_resource
63
  def load_or_create_vector_store(text_chunks):
64
  if not text_chunks:
65
+ st.error("No valid text chunks found to create a vector store. Please check your PDF files or content.")
66
  return None
67
  vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
68
  return vector_store
 
86
  # Main function to run the Streamlit app
87
  def main():
88
  st.title("📄 Gen AI Lawyers Guide")
89
+ st.info("Loading data from the 'law-docs' folder...")
90
+
91
+ folder_path = "law-docs"
92
+ raw_text = fetch_pdf_text_from_folder(folder_path)
93
  text_chunks = get_text_chunks(raw_text)
94
  vector_store = load_or_create_vector_store(text_chunks)
95