retire unoconv
Browse files- base_utils.py +96 -40
base_utils.py
CHANGED
|
@@ -242,41 +242,101 @@ def extract_text_from_pptx(file_path):
|
|
| 242 |
return "\n\n".join(text_content)
|
| 243 |
|
| 244 |
|
| 245 |
-
def
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
#
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
|
|
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
|
| 282 |
def convert_pdf_to_image(file):
|
|
@@ -302,17 +362,13 @@ def extract_text_from_docx(file_path):
|
|
| 302 |
|
| 303 |
def convert_doc_to_text(file_path):
|
| 304 |
try:
|
| 305 |
-
subprocess.run(
|
| 306 |
-
["
|
| 307 |
capture_output=True,
|
| 308 |
text=True,
|
| 309 |
check=True,
|
| 310 |
)
|
| 311 |
-
|
| 312 |
-
with open(txt_file_path, "r") as f:
|
| 313 |
-
text = f.read()
|
| 314 |
-
text = text.lstrip("\ufeff")
|
| 315 |
-
os.remove(txt_file_path)
|
| 316 |
return text
|
| 317 |
except subprocess.CalledProcessError as e:
|
| 318 |
print(f"Error converting {file_path} to text: {e}")
|
|
|
|
| 242 |
return "\n\n".join(text_content)
|
| 243 |
|
| 244 |
|
| 245 |
+
def is_meaningful_text(text: str) -> bool:
    """Heuristically decide whether *text* looks like human-readable text.

    A string is accepted when it is at least three characters long, consists
    solely of printable ASCII characters (0x20-0x7E), and at least 30% of its
    characters are letters.

    Args:
        text: Candidate string. ``None`` and the empty string are rejected.

    Returns:
        True if the string passes every check, False otherwise.
    """
    if not text or len(text) < 3:
        return False

    # Any control character or non-ASCII character disqualifies the string.
    if re.search(r"[^\x20-\x7E]", text):
        return False

    # The ratio test also rejects strings without any letters, so separate
    # "no letters" / "symbols only" / "too short" regex checks are redundant
    # once the length and printable-ASCII guards above have passed.
    letters = sum(1 for c in text if c.isalpha())
    return letters / len(text) >= 0.3
|
| 262 |
+
|
| 263 |
+
def extract_using_unicode_search(path: str, *, max_blocks: int = 30) -> str:
    """Scan a binary file for runs of ASCII characters stored as 2-byte units.

    The file is read fully into memory and walked byte by byte. A printable
    ASCII byte (32-126) immediately followed by a zero byte is treated as one
    character of text (this is how UTF-16LE encodes ASCII characters).
    Consecutive matches form a run; any other byte ends the run. Each run is
    kept only if ``is_meaningful_text`` accepts it.

    Args:
        path: Path of the file to scan.
        max_blocks: Maximum number of unique text blocks to include in the
            result. Defaults to 30.

    Returns:
        The retained blocks joined with newlines, or the literal string
        ``"No text found"`` when no block survives filtering.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as file:
        data = file.read()

    text_blocks = []
    run = bytearray()  # mutable buffer: appending is O(1), unlike bytes +=

    def flush_run():
        """Keep the pending run if it looks like real text, then reset it."""
        if run:
            # errors="ignore" means decode cannot raise; no try/except needed.
            text = run.decode("ascii", errors="ignore").strip()
            if is_meaningful_text(text):
                text_blocks.append(text)
            run.clear()

    i = 0
    last = len(data) - 1  # the final byte has no partner byte to pair with
    while i < last:
        if 32 <= data[i] <= 126 and data[i + 1] == 0:
            run.append(data[i])
            i += 2
        else:
            # Any non-matching byte terminates the current run; advance by one
            # byte so a run starting at an odd offset can still be found.
            flush_run()
            i += 1
    flush_run()

    # Remove duplicates: compare blocks after stripping unusual punctuation,
    # but keep the original (unstripped) block in the output.
    unique, seen = [], set()
    for block in text_blocks:
        cleaned = re.sub(r"[^\w\s\.,;:!?\-]", "", block)
        if cleaned not in seen and len(cleaned) > 5:
            unique.append(block)
            seen.add(cleaned)

    return "\n".join(unique[:max_blocks]) if unique else "No text found"
|
| 312 |
|
| 313 |
+
|
| 314 |
+
def extract_text_from_ppt(file_path: str) -> "str | None":
    """Extract text from a legacy PowerPoint (.ppt) file.

    Validates the path, then delegates to ``extract_using_unicode_search``.

    Args:
        file_path (str): Path to the .ppt file.

    Returns:
        The text returned by ``extract_using_unicode_search``, or None if
        that call raises (the error is printed rather than propagated).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If ``file_path`` does not end with ".ppt"
            (case-insensitive). Only the extension is checked, not the
            file contents.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if not file_path.lower().endswith(".ppt"):
        raise ValueError(f"Unsupported file format: {file_path}. Only .ppt files are supported.")

    try:
        return extract_using_unicode_search(file_path)
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None
        # instead of propagating, so callers must handle a None result.
        print(f"Error extracting text from {file_path}: {e}")
        return None
|
| 340 |
|
| 341 |
|
| 342 |
def convert_pdf_to_image(file):
|
|
|
|
| 362 |
|
| 363 |
def convert_doc_to_text(file_path):
|
| 364 |
try:
|
| 365 |
+
result = subprocess.run(
|
| 366 |
+
["antiword", file_path],
|
| 367 |
capture_output=True,
|
| 368 |
text=True,
|
| 369 |
check=True,
|
| 370 |
)
|
| 371 |
+
text = result.stdout.lstrip("\ufeff")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
return text
|
| 373 |
except subprocess.CalledProcessError as e:
|
| 374 |
print(f"Error converting {file_path} to text: {e}")
|