Update app.py: use pdfplumber for PDF text extraction
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
-
import
|
| 3 |
import gradio as gr
|
| 4 |
from groq import Groq
|
| 5 |
from nltk.tokenize import sent_tokenize
|
|
@@ -15,14 +15,14 @@ def list_pdf_files():
|
|
| 15 |
pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
|
| 16 |
return pdf_files
|
| 17 |
|
| 18 |
-
# Function to extract text and split into chunks
|
| 19 |
def extract_text_from_pdf(file_path):
|
| 20 |
try:
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
full_text +=
|
| 26 |
|
| 27 |
# Split the text into smaller chunks for RAG
|
| 28 |
sentences = sent_tokenize(full_text)
|
|
|
|
| 1 |
import os
|
| 2 |
+
import pdfplumber
|
| 3 |
import gradio as gr
|
| 4 |
from groq import Groq
|
| 5 |
from nltk.tokenize import sent_tokenize
|
|
|
|
| 15 |
pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
|
| 16 |
return pdf_files
|
| 17 |
|
| 18 |
+
# Function to extract text and split into chunks using pdfplumber
|
| 19 |
def extract_text_from_pdf(file_path):
|
| 20 |
try:
|
| 21 |
+
full_text = ""
|
| 22 |
+
with pdfplumber.open(file_path) as pdf:
|
| 23 |
+
for page in pdf.pages:
|
| 24 |
+
page_text = page.extract_text() or ""
|
| 25 |
+
full_text += page_text
|
| 26 |
|
| 27 |
# Split the text into smaller chunks for RAG
|
| 28 |
sentences = sent_tokenize(full_text)
|