Spaces:
Sleeping
Sleeping
Navneet Sai committed on
Commit ·
376e7ad
1
Parent(s): 9ba2bd3
Switch from PyMuPDF to PyPDF for HF Compatibility
Browse files
- app.py +5 -6
- requirements.txt +4 -5
app.py
CHANGED
|
@@ -8,7 +8,7 @@ import tempfile
|
|
| 8 |
from typing import Optional
|
| 9 |
|
| 10 |
import chromadb
|
| 11 |
-
import
|
| 12 |
import gradio as gr
|
| 13 |
from chromadb.utils import embedding_functions
|
| 14 |
from openai import OpenAI
|
|
@@ -28,12 +28,11 @@ current_chunks = []
|
|
| 28 |
|
| 29 |
|
| 30 |
def extract_text_from_pdf(file_path: str) -> str:
|
| 31 |
-
"""Extract text from PDF using
|
| 32 |
-
|
| 33 |
text = ""
|
| 34 |
-
for page in
|
| 35 |
-
text += page.
|
| 36 |
-
doc.close()
|
| 37 |
return text
|
| 38 |
|
| 39 |
|
|
|
|
| 8 |
from typing import Optional
|
| 9 |
|
| 10 |
import chromadb
|
| 11 |
+
from pypdf import PdfReader # PyMuPDF
|
| 12 |
import gradio as gr
|
| 13 |
from chromadb.utils import embedding_functions
|
| 14 |
from openai import OpenAI
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def extract_text_from_pdf(file_path: str) -> str:
|
| 31 |
+
"""Extract text from PDF using pypdf."""
|
| 32 |
+
reader = PdfReader(file_path)
|
| 33 |
text = ""
|
| 34 |
+
for page in reader.pages:
|
| 35 |
+
text += page.extract_text() or ""
|
|
|
|
| 36 |
return text
|
| 37 |
|
| 38 |
|
requirements.txt
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
PyMuPDF==1.23.8
|
|
|
|
| 1 |
+
chromadb
|
| 2 |
+
sentence-transformers
|
| 3 |
+
openai
|
| 4 |
+
pypdf
|
|
|