not-lain committed on
Commit
7bc78fb
·
1 Parent(s): 2ebbfbc

retire unoconv

Browse files
Files changed (1) hide show
  1. base_utils.py +96 -40
base_utils.py CHANGED
@@ -242,41 +242,101 @@ def extract_text_from_pptx(file_path):
242
  return "\n\n".join(text_content)
243
 
244
 
245
- def extract_text_from_ppt(file_path):
246
- try:
247
- print("file_path = ", file_path)
248
- # Convert PPT to PPTX using unoconv
249
- pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
250
- subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
251
-
252
- # Extract text from PPTX
253
- presentation = Presentation(pptx_file_path)
254
- text_content = []
255
-
256
- for slide in presentation.slides:
257
- slide_text = []
258
- for shape in slide.shapes:
259
- if hasattr(shape, "text"):
260
- slide_text.append(shape.text)
261
- text_content.append("\n".join(slide_text))
262
-
263
- # Remove the converted PPTX file
264
- os.remove(pptx_file_path)
265
-
266
- out = "\n\n".join(text_content)
267
- return out
268
- except Exception as e:
269
- print(f"Error extracting text from PPT file: {e}")
270
- return "Error extracting text from PPT file"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
 
272
 
273
- # def extract_text_from_ppt_or_pptx(file_path):
274
- # if file_path.endswith(".pptx"):
275
- # return extract_text_from_pptx(file_path)
276
- # elif file_path.endswith(".ppt"):
277
- # return extract_text_from_ppt(file_path)
278
- # else:
279
- # return "Unsupported file type. Please provide a .ppt or .pptx file."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
 
282
  def convert_pdf_to_image(file):
@@ -302,17 +362,13 @@ def extract_text_from_docx(file_path):
302
 
303
  def convert_doc_to_text(file_path):
304
  try:
305
- subprocess.run(
306
- ["unoconv", "--format", "txt", file_path],
307
  capture_output=True,
308
  text=True,
309
  check=True,
310
  )
311
- txt_file_path = file_path.replace(".doc", ".txt")
312
- with open(txt_file_path, "r") as f:
313
- text = f.read()
314
- text = text.lstrip("\ufeff")
315
- os.remove(txt_file_path)
316
  return text
317
  except subprocess.CalledProcessError as e:
318
  print(f"Error converting {file_path} to text: {e}")
 
242
  return "\n\n".join(text_content)
243
 
244
 
245
def is_meaningful_text(text: str) -> bool:
    """Return True when *text* looks like human-readable text rather than noise.

    A string is accepted only if all of the following hold:

    * it is at least 3 characters long,
    * every character is printable ASCII (0x20-0x7E), so tabs, newlines and
      any non-ASCII character cause rejection,
    * at least 30% of its characters are letters.

    Args:
        text: Candidate string; ``None`` or an empty string is rejected.

    Returns:
        bool: True if the candidate passes every check above.
    """
    if not text or len(text) < 3:
        return False

    # Anything outside printable ASCII marks the run as binary residue.
    if any(not (" " <= ch <= "~") for ch in text):
        return False

    # The ratio check also rejects strings with no letters at all
    # (digits-only, symbols-only), so no separate pattern checks are needed.
    letters = sum(1 for ch in text if ch.isalpha())
    return letters / len(text) >= 0.3
262
+
263
def extract_using_unicode_search(path: str, max_blocks: int = 30) -> str:
    """Scrape ASCII text stored as UTF-16LE byte pairs from a binary file.

    The file is read whole and scanned for runs of "printable ASCII byte
    followed by a NUL byte". Each run is reduced to its ASCII characters,
    stripped, and kept only if ``is_meaningful_text`` accepts it. Runs are
    then de-duplicated in order of appearance, comparing them after removing
    every character that is not a word character, whitespace or one of
    ``.,;:!?-``; candidates whose cleaned form is 5 characters or shorter
    are dropped.

    Args:
        path: Path of the file to scan (opened in binary mode).
        max_blocks: Maximum number of text blocks to return. Defaults to 30;
            pass ``None`` for no limit.

    Returns:
        str: The kept blocks joined by newlines, or the literal string
        ``"No text found"`` when no block survives filtering.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as file:
        data = file.read()

    # One or more (printable ASCII byte, NUL byte) pairs. Scanning leftmost
    # first and consuming whole pairs mirrors a manual two-byte-step walk
    # that restarts one byte after the position where a run breaks.
    utf16_ascii_run = re.compile(rb"(?:[\x20-\x7E]\x00)+")

    unique, seen = [], set()
    for match in utf16_ascii_run.finditer(data):
        # Every second byte is the NUL half of the pair; drop it.
        text = match.group()[::2].decode("ascii").strip()
        if not is_meaningful_text(text):
            continue

        # Compare on a punctuation-normalised form so near-identical blocks
        # collapse, but keep the original text for the output.
        cleaned = re.sub(r"[^\w\s\.,;:!?\-]", "", text)
        if len(cleaned) > 5 and cleaned not in seen:
            unique.append(text)
            seen.add(cleaned)

    return "\n".join(unique[:max_blocks]) if unique else "No text found"
312
 
313
+
314
def extract_text_from_ppt(file_path: str) -> "str | None":
    """Extract text from a legacy PowerPoint (.ppt) file.

    The path is validated, then the work is delegated to
    ``extract_using_unicode_search``. Only the file extension is checked;
    the file contents are not validated as a PowerPoint document.

    Args:
        file_path: Path to the .ppt file.

    Returns:
        The string produced by ``extract_using_unicode_search``, or ``None``
        if that call raises (the error is printed, not re-raised).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If ``file_path`` does not end with ``.ppt``
            (case-insensitive).
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if not file_path.lower().endswith(".ppt"):
        raise ValueError(f"Unsupported file format: {file_path}. Only .ppt files are supported.")

    # Best-effort boundary: extraction failures are reported and turned into
    # None so callers do not have to handle scanner errors.
    try:
        return extract_using_unicode_search(file_path)
    except Exception as e:
        print(f"Error extracting text from {file_path}: {e}")
        return None
340
 
341
 
342
  def convert_pdf_to_image(file):
 
362
 
363
  def convert_doc_to_text(file_path):
364
  try:
365
+ result = subprocess.run(
366
+ ["antiword", file_path],
367
  capture_output=True,
368
  text=True,
369
  check=True,
370
  )
371
+ text = result.stdout.lstrip("\ufeff")
 
 
 
 
372
  return text
373
  except subprocess.CalledProcessError as e:
374
  print(f"Error converting {file_path} to text: {e}")