Spaces:

JackSparrow89
/

Semantic_File

Sleeping

App Files Files Community

Semantic_File / indexer /extractor.py

JackSparrow89

Upload 65 files

bb04c5f verified about 1 month ago

raw

history blame contribute delete

3.81 kB

	# indexer/extractor.py

	import os
	import json
	import fitz # PyMuPDF
	from docx import Document
	from pptx import Presentation
	from openpyxl import load_workbook


	class Extractor:
	    """
	    Extracts raw text content from different file types.

	    Each supported file type has its own ``extract_*`` method; ``extract``
	    picks the right one from the file extension.
	    """

	    def extract(self, filepath):
	        """
	        Main dispatcher — picks the extraction method based on file extension.

	        The extension is compared in lower case, so ``.PDF`` and ``.pdf`` are
	        treated the same. This method is deliberately best-effort: any
	        exception raised by a handler is printed and swallowed so that one
	        unreadable file does not abort the caller.

	        Args:
	            filepath: Path of the file to extract text from.

	        Returns:
	            The extracted text, or ``""`` when the extension is not
	            recognized or extraction failed.
	        """
	        handlers = {
	            ".pdf": self.extract_pdf,
	            ".docx": self.extract_docx,
	            ".pptx": self.extract_pptx,
	            ".xlsx": self.extract_xlsx,
	            ".ipynb": self.extract_ipynb,
	            ".txt": self.extract_text,
	            ".md": self.extract_text,
	            ".py": self.extract_text,
	            ".js": self.extract_text,
	        }

	        try:
	            ext = os.path.splitext(filepath)[1].lower()
	            handler = handlers.get(ext)
	            if handler is None:
	                print(f"Warning: Unrecognized file extension: {ext}")
	                return ""
	            return handler(filepath)
	        except Exception as e:
	            # Top-level boundary: report the failure and fall back to "".
	            print(f"Error extracting text from {filepath}: {e}")
	            return ""

	    def extract_pdf(self, filepath):
	        """
	        Extract text from a PDF file using PyMuPDF.

	        Returns:
	            The text of every page, joined with newlines.
	        """
	        # The context manager closes the document even if a page fails to
	        # yield text part-way through.
	        with fitz.open(filepath) as doc:
	            return "\n".join(page.get_text() for page in doc)

	    def extract_docx(self, filepath):
	        """
	        Extract text from a Word document using python-docx.

	        Returns:
	            The text of each entry in ``doc.paragraphs``, joined with
	            newlines.
	        """
	        doc = Document(filepath)
	        return "\n".join(para.text for para in doc.paragraphs)

	    def extract_pptx(self, filepath):
	        """
	        Extract text from a PowerPoint file using python-pptx.

	        Only shapes that report ``has_text_frame`` are read.

	        Returns:
	            One line per text-frame paragraph, in slide and shape order,
	            joined with newlines.
	        """
	        prs = Presentation(filepath)
	        lines = []
	        for slide in prs.slides:
	            for shape in slide.shapes:
	                if not shape.has_text_frame:
	                    continue
	                lines.extend(para.text for para in shape.text_frame.paragraphs)
	        return "\n".join(lines)

	    def extract_xlsx(self, filepath):
	        """
	        Extract text from an Excel file using openpyxl.

	        The workbook is loaded with ``data_only=True``. Each row becomes one
	        output line made of its non-empty cell values separated by spaces;
	        a row with no values becomes an empty line.

	        Returns:
	            All rows of all worksheets, joined with newlines.
	        """
	        wb = load_workbook(filepath, data_only=True)
	        rows = []
	        # Iterate wb.worksheets rather than wb.sheetnames: the latter also
	        # names chart sheets, which have no cell rows to iterate.
	        for sheet in wb.worksheets:
	            for row in sheet.iter_rows(values_only=True):
	                rows.append(" ".join(str(value) for value in row if value is not None))
	        return "\n".join(rows)

	    def extract_ipynb(self, filepath):
	        """
	        Extract text from a Jupyter notebook (.ipynb) file.

	        The source of every cell is collected regardless of cell type.
	        A notebook without a ``cells`` list yields ``""``, and a cell
	        without ``source`` contributes an empty line.

	        Returns:
	            The source text of each cell, joined with newlines.
	        """
	        # utf-8-sig accepts plain UTF-8 and also strips a leading BOM, which
	        # json.load would otherwise reject.
	        with open(filepath, "r", encoding="utf-8-sig") as f:
	            notebook = json.load(f)
	        cells = []
	        for cell in notebook.get("cells", []):
	            source = cell.get("source", "")
	            # "source" may be a single string or a list of line strings.
	            cells.append(source if isinstance(source, str) else "".join(source))
	        return "\n".join(cells)

	    def extract_text(self, filepath):
	        """
	        Extract text from plain text files (.txt, .md, .py, .js, etc.)

	        Bytes that are not valid UTF-8 are dropped (``errors="ignore"``).

	        Returns:
	            The whole file content as a string, without a leading BOM.
	        """
	        # utf-8-sig keeps a leading byte-order mark out of the returned text.
	        with open(filepath, "r", encoding="utf-8-sig", errors="ignore") as f:
	            return f.read()


	# --- Manual smoke test: extract one file and print a short preview ---
	if __name__ == "__main__":
	    import sys

	    extractor = Extractor()

	    # Guard clause: with no path argument, only show the usage hint.
	    if len(sys.argv) <= 1:
	        print("Usage: python -m indexer.extractor <filepath>")
	    else:
	        filepath = sys.argv[1]
	        text = extractor.extract(filepath)
	        # Report the total length, then at most the first 500 characters.
	        print(f"Extracted {len(text)} characters from {filepath}")
	        print(f"Preview:\n{text[:500]}")