# File size: 2,281 Bytes (revision f373e2b)
import os
import zipfile
import re
import xml.etree.ElementTree as ET
def extract_text_from_docx(file_path):
    """Print the text found in a .docx file, capped at 2000 characters.

    The file is opened as a zip archive and ``word/document.xml`` is parsed.
    The text of every ``w:t`` element is collected and joined with newlines.
    If the joined text is longer than 2000 characters, only the first 2000
    are printed, followed by ``...``.

    Args:
        file_path: Path to the .docx file to read.

    Returns:
        None. A header line and the extracted text are written to stdout.
        Any exception raised while reading or parsing is caught and reported
        on stdout instead of propagating.
    """
    print(f"--- Extracting from {os.path.basename(file_path)} ---")
    try:
        with zipfile.ZipFile(file_path) as z:
            xml_content = z.read("word/document.xml")
        tree = ET.fromstring(xml_content)
        # Find all text nodes in w:t
        namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
        text_nodes = tree.findall(".//w:t", namespaces)
        text = [node.text for node in text_nodes if node.text]
        joined = "\n".join(text)
        # Measure the joined string, not the number of nodes: the cap is on
        # printed characters, so a few very long runs must be truncated too.
        if len(joined) > 2000:
            joined = joined[:2000] + "..."
        print(joined)
    except Exception as e:
        # Best-effort dump: report the problem and let the caller continue.
        print(f"Error reading docx {file_path}: {e}")
def extract_text_from_pptx(file_path):
    """Print the text found on each slide of a .pptx file, in slide order.

    The file is opened as a zip archive. Every member whose name starts with
    ``ppt/slides/slide`` and ends with ``.xml`` is parsed, and the text of
    its ``a:t`` elements is printed under a ``[Slide <member name>]:``
    heading. Slides with no text are skipped.

    Args:
        file_path: Path to the .pptx file to read.

    Returns:
        None. Output is written to stdout. Any exception raised while
        reading or parsing is caught and reported on stdout instead of
        propagating; slides printed before the failure stay printed.
    """

    def slide_order(member_name):
        # Order by the numeric suffix so slide2.xml precedes slide10.xml;
        # a plain string sort would put "slide10" before "slide2".
        match = re.search(r"slide(\d+)\.xml$", member_name)
        if match:
            return (0, int(match.group(1)), member_name)
        # Names without a numeric suffix go last, ordered by name.
        return (1, 0, member_name)

    print(f"--- Extracting from {os.path.basename(file_path)} ---")
    try:
        with zipfile.ZipFile(file_path) as z:
            # Find slides
            slides = [f for f in z.namelist() if f.startswith("ppt/slides/slide") and f.endswith(".xml")]
            slides.sort(key=slide_order)
            namespaces = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
            for slide in slides:
                xml_content = z.read(slide)
                tree = ET.fromstring(xml_content)
                # Text is usually in a:t
                text_nodes = tree.findall(".//a:t", namespaces)
                text = [node.text for node in text_nodes if node.text]
                if text:
                    print(f"\n[Slide {slide}]:")
                    print("\n".join(text))
    except Exception as e:
        # Best-effort dump: report the problem and let the caller continue.
        print(f"Error reading pptx {file_path}: {e}")
if __name__ == "__main__":
    # Script entry point: dump the text of a fixed set of review documents
    # located under base_dir, choosing the extractor by file extension.
    base_dir = "/home/kbs/final_project"
    files = [
        "Review2 - Project Template - B.Tech.docx",
        "Rubrics_review_evaluation-REVIEW_2.docx",
        "Review_PPT_4-2_2 (1).pptx",
    ]
    for filename in files:
        path = os.path.join(base_dir, filename)
        # Report missing files up front and move on to the next one.
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        if filename.endswith(".docx"):
            extract_text_from_docx(path)
        elif filename.endswith(".pptx"):
            extract_text_from_pptx(path)
|