# Source: Semantic_File / indexer / extractor.py
# Uploaded by JackSparrow89 ("Upload 65 files", commit bb04c5f, verified)
# indexer/extractor.py
import os
import json
import fitz # PyMuPDF
from docx import Document
from pptx import Presentation
from openpyxl import load_workbook
class Extractor:
    """
    Extracts raw text content from different file types.

    Each supported file type has its own ``extract_*`` method; ``extract``
    is the entry point that selects one from the file extension. The
    per-type methods let exceptions propagate, while ``extract`` converts
    any failure into an empty string plus a printed message.
    """

    def extract(self, filepath) -> str:
        """
        Main dispatcher — picks the right extraction method based on file extension.

        Args:
            filepath: Path of the file to read. The extension is matched
                case-insensitively.

        Returns:
            The extracted text, or ``""`` when the extension is not
            recognized or the chosen handler raises.
        """
        handlers = {
            ".pdf": self.extract_pdf,
            ".docx": self.extract_docx,
            ".pptx": self.extract_pptx,
            ".xlsx": self.extract_xlsx,
            ".ipynb": self.extract_ipynb,
            ".txt": self.extract_text,
            ".md": self.extract_text,
            ".py": self.extract_text,
            ".js": self.extract_text,
        }
        # Deliberate best-effort boundary: one unreadable file must not abort
        # the caller, so every failure is reported and mapped to "".
        try:
            ext = os.path.splitext(filepath)[1].lower()
            handler = handlers.get(ext)
            if handler is None:
                print(f"Warning: Unrecognized file extension: {ext}")
                return ""
            return handler(filepath)
        except Exception as e:
            print(f"Error extracting text from {filepath}: {e}")
            return ""

    def extract_pdf(self, filepath) -> str:
        """Extract text from a PDF file using PyMuPDF.

        Returns the text of every page, joined with newlines.
        """
        doc = fitz.open(filepath)
        try:
            return "\n".join(page.get_text() for page in doc)
        finally:
            # Release the file handle even if a page fails to render.
            doc.close()

    def extract_docx(self, filepath) -> str:
        """Extract text from a Word document using python-docx.

        Returns the text of each entry in ``doc.paragraphs``, one per line.
        """
        doc = Document(filepath)
        return "\n".join(para.text for para in doc.paragraphs)

    def extract_pptx(self, filepath) -> str:
        """Extract text from a PowerPoint file using python-pptx.

        Returns the text of every paragraph of every shape that has a text
        frame, in slide order, one paragraph per line.
        """
        prs = Presentation(filepath)
        lines = [
            para.text
            for slide in prs.slides
            for shape in slide.shapes
            if shape.has_text_frame
            for para in shape.text_frame.paragraphs
        ]
        return "\n".join(lines)

    def extract_xlsx(self, filepath) -> str:
        """Extract text from an Excel file using openpyxl.

        Each row of each sheet becomes one output line containing the row's
        non-empty cell values separated by single spaces; a row with no
        values yields an empty line.
        """
        # data_only=True asks openpyxl for stored values rather than formulas.
        wb = load_workbook(filepath, data_only=True)
        try:
            rows = []
            for sheet_name in wb.sheetnames:
                sheet = wb[sheet_name]
                for row in sheet.iter_rows():
                    values = [
                        str(cell.value) for cell in row if cell.value is not None
                    ]
                    rows.append(" ".join(values))
            return "\n".join(rows)
        finally:
            wb.close()

    def extract_ipynb(self, filepath) -> str:
        """Extract text from a Jupyter notebook (.ipynb) file.

        Returns the ``source`` of every cell, one cell per line group. A
        cell's source may be stored as a single string or as a list of line
        strings; both are accepted. A notebook without a ``cells`` list
        yields ``""`` and a cell without ``source`` contributes an empty
        entry, instead of discarding the whole notebook.
        """
        # utf-8-sig reads plain UTF-8 and also tolerates a leading BOM, which
        # json.load would otherwise reject.
        with open(filepath, "r", encoding="utf-8-sig") as f:
            notebook = json.load(f)
        cells = []
        for cell in notebook.get("cells") or []:
            source = cell.get("source") or ""
            if not isinstance(source, str):
                source = "".join(source)
            cells.append(source)
        return "\n".join(cells)

    def extract_text(self, filepath) -> str:
        """Extract text from plain text files (.txt, .md, .py, .js, etc.)

        The file is decoded as UTF-8; undecodable bytes are dropped.
        """
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
# --- Test it ---
# Manual smoke test: extract one file given on the command line and print
# the character count plus the first 500 characters of the result.
if __name__ == "__main__":
    import sys

    extractor = Extractor()
    if len(sys.argv) > 1:
        filepath = sys.argv[1]
        text = extractor.extract(filepath)
        print(f"Extracted {len(text)} characters from {filepath}")
        print(f"Preview:\n{text[:500]}")
    else:
        print("Usage: python -m indexer.extractor <filepath>")