campus-Me / src /ai_engine /document_parser.py
Mithun-999's picture
Complete AI Academic Document Suite
342973b
"""
Document Parser - Extract and parse uploaded documents
"""
import os
import io
from typing import Dict, List, Tuple, Optional
from pathlib import Path
class DocumentParser:
"""
Parse various document formats (PDF, Word, text, Markdown) and extract content.
"""
def __init__(self):
"""Initialize the document parser."""
self.supported_formats = [".pdf", ".docx", ".doc", ".txt", ".md"]
def parse_file(self, file_path: str) -> Dict[str, any]:
"""
Parse a document file and extract content.
Args:
file_path: Path to the document file
Returns:
Dict containing:
- text: Extracted text content
- metadata: Document metadata
- sections: Parsed sections if available
- format: File format
"""
file_ext = Path(file_path).suffix.lower()
if file_ext not in self.supported_formats:
raise ValueError(f"Unsupported format: {file_ext}")
if file_ext == ".pdf":
return self._parse_pdf(file_path)
elif file_ext in [".docx", ".doc"]:
return self._parse_word(file_path)
elif file_ext in [".txt", ".md"]:
return self._parse_text(file_path)
return {"text": "", "metadata": {}, "sections": [], "format": file_ext}
def _parse_pdf(self, file_path: str) -> Dict[str, any]:
"""Parse PDF file."""
try:
import pdfplumber
content = []
metadata = {}
with pdfplumber.open(file_path) as pdf:
metadata["pages"] = len(pdf.pages)
metadata["title"] = pdf.metadata.get("Title", "Unknown")
for page in pdf.pages:
text = page.extract_text()
if text:
content.append(text)
return {
"text": "\n\n".join(content),
"metadata": metadata,
"sections": self._extract_sections("\n\n".join(content)),
"format": ".pdf",
}
except ImportError:
return {
"text": "PDF parsing requires pdfplumber",
"metadata": {},
"sections": [],
"format": ".pdf",
}
except Exception as e:
return {
"text": "",
"metadata": {"error": str(e)},
"sections": [],
"format": ".pdf",
}
def _parse_word(self, file_path: str) -> Dict[str, any]:
"""Parse Word document."""
try:
from docx import Document
doc = Document(file_path)
content = []
metadata = {"paragraphs": len(doc.paragraphs)}
for para in doc.paragraphs:
if para.text.strip():
content.append(para.text)
text = "\n\n".join(content)
return {
"text": text,
"metadata": metadata,
"sections": self._extract_sections(text),
"format": ".docx",
}
except ImportError:
return {
"text": "Word parsing requires python-docx",
"metadata": {},
"sections": [],
"format": ".docx",
}
except Exception as e:
return {
"text": "",
"metadata": {"error": str(e)},
"sections": [],
"format": ".docx",
}
def _parse_text(self, file_path: str) -> Dict[str, any]:
"""Parse plain text or Markdown file."""
try:
with open(file_path, "r", encoding="utf-8") as f:
text = f.read()
file_ext = Path(file_path).suffix.lower()
return {
"text": text,
"metadata": {"lines": len(text.split("\n"))},
"sections": self._extract_sections(text),
"format": file_ext,
}
except Exception as e:
return {
"text": "",
"metadata": {"error": str(e)},
"sections": [],
"format": Path(file_path).suffix.lower(),
}
def _extract_sections(self, text: str) -> List[Dict[str, str]]:
"""
Extract sections from text based on headers.
Args:
text: Document text
Returns:
List of sections with title and content
"""
sections = []
lines = text.split("\n")
current_section = None
current_content = []
for line in lines:
# Check for markdown headers
if line.startswith("#"):
if current_section:
sections.append(
{"title": current_section, "content": "\n".join(current_content)}
)
current_section = line.lstrip("#").strip()
current_content = []
elif line.strip():
if current_section:
current_content.append(line)
else:
current_section = "Introduction"
current_content.append(line)
if current_section and current_content:
sections.append({"title": current_section, "content": "\n".join(current_content)})
return sections if sections else [{"title": "Content", "content": text}]
def parse_text_input(self, text: str) -> Dict[str, any]:
"""
Parse raw text input.
Args:
text: Raw text content
Returns:
Parsed content dictionary
"""
return {
"text": text,
"metadata": {"lines": len(text.split("\n")), "words": len(text.split())},
"sections": self._extract_sections(text),
"format": ".txt",
}