Spaces:
Build error
Build error
Update utils/document_processor.py
Browse files
- utils/document_processor.py +31 -1
utils/document_processor.py
CHANGED
|
@@ -154,8 +154,36 @@ class DocumentProcessor:
|
|
| 154 |
chunks = self._create_chunks(text)
|
| 155 |
return text, chunks
|
| 156 |
|
| 157 |
-
|
| 158 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
images = convert_from_bytes(open(file_path, 'rb').read())
|
| 160 |
text = ""
|
| 161 |
for page_num, image in enumerate(images, 1):
|
|
@@ -167,6 +195,8 @@ class DocumentProcessor:
|
|
| 167 |
st.error(f"Error processing PDF: {str(e)}")
|
| 168 |
raise
|
| 169 |
|
|
|
|
|
|
|
| 170 |
def _process_docx(self, file_path: str) -> str:
|
| 171 |
"""Extract text from DOCX files."""
|
| 172 |
try:
|
|
|
|
| 154 |
chunks = self._create_chunks(text)
|
| 155 |
return text, chunks
|
| 156 |
|
| 157 |
+
def __process_pdf(self, file_path: str) -> str:
|
| 158 |
try:
|
| 159 |
+
# Try some common Poppler installation paths
|
| 160 |
+
poppler_paths = [
|
| 161 |
+
"/usr/bin",
|
| 162 |
+
"/usr/local/bin",
|
| 163 |
+
"/opt/poppler/bin",
|
| 164 |
+
"/Library/Frameworks/Poppler.framework/Versions/Current/bin", # for macOS
|
| 165 |
+
]
|
| 166 |
+
|
| 167 |
+
# Find the first valid Poppler path
|
| 168 |
+
for poppler_dir in poppler_paths:
|
| 169 |
+
if os.path.exists(os.path.join(poppler_dir, "pdftoppm")):
|
| 170 |
+
break
|
| 171 |
+
else:
|
| 172 |
+
raise ValueError("Poppler not found in any of the common installation paths.")
|
| 173 |
+
|
| 174 |
+
# Update the PATH and LD_LIBRARY_PATH environment variables
|
| 175 |
+
os.environ["PATH"] = f"{poppler_dir}:{os.environ['PATH']}"
|
| 176 |
+
os.environ["LD_LIBRARY_PATH"] = f"{poppler_dir}:{os.environ.get('LD_LIBRARY_PATH', '')}"
|
| 177 |
+
|
| 178 |
+
# Test the Poppler installation
|
| 179 |
+
try:
|
| 180 |
+
subprocess.check_output(["pdftoppm", "-v"])
|
| 181 |
+
st.info("Poppler is installed and in the PATH.")
|
| 182 |
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
| 183 |
+
st.error("Unable to find Poppler. Please check the installation.")
|
| 184 |
+
raise
|
| 185 |
+
|
| 186 |
+
# Process the PDF file using pdf2image and Tesseract OCR
|
| 187 |
images = convert_from_bytes(open(file_path, 'rb').read())
|
| 188 |
text = ""
|
| 189 |
for page_num, image in enumerate(images, 1):
|
|
|
|
| 195 |
st.error(f"Error processing PDF: {str(e)}")
|
| 196 |
raise
|
| 197 |
|
| 198 |
+
|
| 199 |
+
|
| 200 |
def _process_docx(self, file_path: str) -> str:
|
| 201 |
"""Extract text from DOCX files."""
|
| 202 |
try:
|