akazmi committed on
Commit
bb9ec18
·
verified ·
1 Parent(s): 036e3d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -7
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import os
2
- import PyPDF2
3
  import gradio as gr
4
  from groq import Groq
5
  from nltk.tokenize import sent_tokenize
@@ -15,14 +15,14 @@ def list_pdf_files():
15
  pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
16
  return pdf_files
17
 
18
- # Function to extract text and split into chunks
19
  def extract_text_from_pdf(file_path):
20
  try:
21
- with open(file_path, 'rb') as file:
22
- pdf_reader = PyPDF2.PdfReader(file)
23
- full_text = ""
24
- for page in pdf_reader.pages:
25
- full_text += page.extract_text() or ""
26
 
27
  # Split the text into smaller chunks for RAG
28
  sentences = sent_tokenize(full_text)
 
1
  import os
2
+ import pdfplumber
3
  import gradio as gr
4
  from groq import Groq
5
  from nltk.tokenize import sent_tokenize
 
15
  pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
16
  return pdf_files
17
 
18
+ # Function to extract text and split into chunks using pdfplumber
19
  def extract_text_from_pdf(file_path):
20
  try:
21
+ full_text = ""
22
+ with pdfplumber.open(file_path) as pdf:
23
+ for page in pdf.pages:
24
+ page_text = page.extract_text() or ""
25
+ full_text += page_text
26
 
27
  # Split the text into smaller chunks for RAG
28
  sentences = sent_tokenize(full_text)