Spaces:

akazmi
/

Legal2

Sleeping

akazmi committed on Nov 11, 2024

Commit

bb9ec18

verified ·

1 Parent(s): 036e3d9

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-import PyPDF2
 import gradio as gr
 from groq import Groq
 from nltk.tokenize import sent_tokenize
@@ -15,14 +15,14 @@ def list_pdf_files():
     pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
     return pdf_files
-# Function to extract text and split into chunks
 def extract_text_from_pdf(file_path):
     try:
-        with open(file_path, 'rb') as file:
-            pdf_reader = PyPDF2.PdfReader(file)
-            full_text = ""
-            for page in pdf_reader.pages:
-                full_text += page.extract_text() or ""
         # Split the text into smaller chunks for RAG
         sentences = sent_tokenize(full_text)

 import os
+import pdfplumber
 import gradio as gr
 from groq import Groq
 from nltk.tokenize import sent_tokenize
     pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
     return pdf_files
+# Function to extract text and split into chunks using pdfplumber
 def extract_text_from_pdf(file_path):
     try:
+        full_text = ""
+        with pdfplumber.open(file_path) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text() or ""
+                full_text += page_text
         # Split the text into smaller chunks for RAG
         sentences = sent_tokenize(full_text)