# indexer/extractor.py
import os
import json
import fitz # PyMuPDF
from docx import Document
from pptx import Presentation
from openpyxl import load_workbook
class Extractor:
    """
    Extracts raw text content from different file types.

    ``extract`` is the entry point: it picks a per-format method from the
    file extension and returns that method's text. Each ``extract_*`` method
    can also be called directly; those let errors propagate, whereas
    ``extract`` reports them with ``print`` and returns an empty string.
    """

    def extract(self, filepath):
        """
        Main dispatcher — picks the right extraction method based on file extension.

        The extension is matched case-insensitively. Returns the extracted
        text, or "" when the extension is not recognized or when the chosen
        handler raises (the problem is printed in both cases).
        """
        handlers = {
            ".pdf": self.extract_pdf,
            ".docx": self.extract_docx,
            ".pptx": self.extract_pptx,
            ".xlsx": self.extract_xlsx,
            ".ipynb": self.extract_ipynb,
            ".txt": self.extract_text,
            ".md": self.extract_text,
            ".py": self.extract_text,
            ".js": self.extract_text,
        }
        # Deliberate best-effort boundary: one unreadable file must not abort
        # the caller, so any failure is reported and mapped to "".
        try:
            ext = os.path.splitext(filepath)[1].lower()
            handler = handlers.get(ext)
            if handler is None:
                print(f"Warning: Unrecognized file extension: {ext}")
                return ""
            return handler(filepath)
        except Exception as e:
            print(f"Error extracting text from {filepath}: {e}")
            return ""

    def extract_pdf(self, filepath):
        """Extract text from a PDF file using PyMuPDF, one page per line group."""
        doc = fitz.open(filepath)
        # Close the document even if a page fails to render its text.
        try:
            return "\n".join(page.get_text() for page in doc)
        finally:
            doc.close()

    def extract_docx(self, filepath):
        """Extract text from a Word document using python-docx (body paragraphs)."""
        doc = Document(filepath)
        return "\n".join(para.text for para in doc.paragraphs)

    def extract_pptx(self, filepath):
        """Extract text from a PowerPoint file using python-pptx.

        Collects the paragraphs of every shape that has a text frame, in
        slide order; shapes without a text frame are skipped.
        """
        prs = Presentation(filepath)
        lines = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if not shape.has_text_frame:
                    continue
                lines.extend(para.text for para in shape.text_frame.paragraphs)
        return "\n".join(lines)

    def extract_xlsx(self, filepath):
        """Extract text from an Excel file using openpyxl.

        Each row becomes one output line made of its non-empty cell values
        joined by single spaces. ``data_only=True`` asks openpyxl for stored
        values rather than formula text.
        """
        wb = load_workbook(filepath, data_only=True)
        rows = []
        # Iterate wb.worksheets rather than wb.sheetnames: the latter also
        # lists chart sheets, which have no iter_rows() and would make the
        # whole workbook fail.
        for sheet in wb.worksheets:
            for row in sheet.iter_rows():
                cells = [str(cell.value) for cell in row if cell.value is not None]
                rows.append(" ".join(cells))
        return "\n".join(rows)

    def extract_ipynb(self, filepath):
        """Extract text from a Jupyter notebook (.ipynb) file.

        Returns the ``source`` of every entry in the top-level ``cells``
        list, one cell per line group. A missing ``cells`` list yields "",
        and a cell without ``source`` contributes an empty string instead of
        failing the whole notebook.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            notebook = json.load(f)
        # "".join works whether source is a list of lines or a single string.
        return "\n".join(
            "".join(cell.get("source", "")) for cell in notebook.get("cells", [])
        )

    def extract_text(self, filepath):
        """Extract text from plain text files (.txt, .md, .py, .js, etc.)

        The file is read as UTF-8; undecodable bytes are dropped.
        """
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
# --- Manual smoke test: extract one file given on the command line ---
if __name__ == "__main__":
    import sys

    extractor = Extractor()
    if len(sys.argv) > 1:
        filepath = sys.argv[1]
        text = extractor.extract(filepath)
        # Report the size and show only the first 500 characters as a preview.
        print(f"Extracted {len(text)} characters from {filepath}")
        print(f"Preview:\n{text[:500]}")
    else:
        print("Usage: python -m indexer.extractor <filepath>")