Spaces:

JackSparrow89
/

Semantic_File

Sleeping

File size: 3,805 Bytes

bb04c5f

# indexer/extractor.py

import os
import json
import fitz  # PyMuPDF
from docx import Document
from pptx import Presentation
from openpyxl import load_workbook


class Extractor:
    """Extract raw text content from different file types.

    Each supported file type has its own ``extract_*`` method; ``extract``
    picks the right one from the file extension.
    """

    def extract(self, filepath):
        """Dispatch to the extraction method matching the file extension.

        The extension is compared case-insensitively.

        Args:
            filepath: Path of the file to read.

        Returns:
            The extracted text, or an empty string when the extension is
            not recognized or the chosen handler raises. A message is
            printed in both of those cases.
        """
        handlers = {
            ".pdf": self.extract_pdf,
            ".docx": self.extract_docx,
            ".pptx": self.extract_pptx,
            ".xlsx": self.extract_xlsx,
            ".ipynb": self.extract_ipynb,
            ".txt": self.extract_text,
            ".md": self.extract_text,
            ".py": self.extract_text,
            ".js": self.extract_text,
        }

        # Deliberately broad best-effort boundary: any failure while
        # resolving or running a handler is reported and turned into "".
        try:
            ext = os.path.splitext(filepath)[1].lower()
            handler = handlers.get(ext)
            if handler is None:
                print(f"Warning: Unrecognized file extension: {ext}")
                return ""
            return handler(filepath)
        except Exception as e:
            print(f"Error extracting text from {filepath}: {e}")
            return ""

    def extract_pdf(self, filepath):
        """Extract text from a PDF file using PyMuPDF.

        Returns:
            The text of every page, joined with newlines.
        """
        # The context manager closes the document even if a page fails
        # to render its text, so the file handle is never leaked.
        with fitz.open(filepath) as doc:
            return "\n".join(page.get_text() for page in doc)

    def extract_docx(self, filepath):
        """Extract text from a Word document using python-docx.

        Returns:
            The text of each entry in the document's ``paragraphs``
            collection, joined with newlines.
        """
        doc = Document(filepath)
        return "\n".join(para.text for para in doc.paragraphs)

    def extract_pptx(self, filepath):
        """Extract text from a PowerPoint file using python-pptx.

        Only shapes that report a text frame contribute text.

        Returns:
            One line per text-frame paragraph, in slide and shape order,
            joined with newlines.
        """
        prs = Presentation(filepath)
        lines = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if not shape.has_text_frame:
                    continue
                lines.extend(para.text for para in shape.text_frame.paragraphs)
        return "\n".join(lines)

    def extract_xlsx(self, filepath):
        """Extract text from an Excel file using openpyxl.

        The workbook is loaded with ``data_only=True``.

        Returns:
            One line per worksheet row, with the row's non-empty cell
            values converted to strings and separated by single spaces;
            lines are joined with newlines.
        """
        wb = load_workbook(filepath, data_only=True)
        rows = []
        # Iterate ``worksheets`` rather than ``sheetnames``: the latter also
        # lists chart sheets, which have no ``iter_rows`` and would abort
        # the whole extraction.
        for sheet in wb.worksheets:
            for row in sheet.iter_rows():
                values = [str(cell.value) for cell in row if cell.value is not None]
                rows.append(" ".join(values))
        return "\n".join(rows)

    def extract_ipynb(self, filepath):
        """Extract text from a Jupyter notebook (.ipynb) file.

        Reads the top-level ``"cells"`` list and concatenates each cell's
        ``"source"``. A missing ``"cells"`` key yields an empty string and a
        cell without ``"source"`` contributes an empty entry.

        Returns:
            The source of every cell, joined with newlines.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            notebook = json.load(f)
        if not isinstance(notebook, dict):
            return ""
        cells = []
        for cell in notebook.get("cells") or []:
            source = cell.get("source", "")
            # ``source`` may be stored as one string or as a list of lines.
            cells.append(source if isinstance(source, str) else "".join(source))
        return "\n".join(cells)

    def extract_text(self, filepath):
        """Extract text from plain text files (.txt, .md, .py, .js, etc.).

        The file is decoded as UTF-8; bytes that cannot be decoded are
        dropped rather than raising.

        Returns:
            The full decoded contents of the file.
        """
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()


# --- Manual smoke test: extract one file given on the command line ---
if __name__ == "__main__":
    import sys

    extractor = Extractor()

    cli_args = sys.argv[1:]
    if not cli_args:
        # No path supplied: show how to invoke the module.
        print("Usage: python -m indexer.extractor <filepath>")
    else:
        filepath = cli_args[0]
        text = extractor.extract(filepath)
        # Report the total length, then the first 500 characters as a preview.
        print(f"Extracted {len(text)} characters from {filepath}")
        print(f"Preview:\n{text[:500]}")