Spaces:
Paused
Paused
File size: 6,009 Bytes
342973b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
"""
Document Parser - Extract and parse uploaded documents
"""
import io
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
class DocumentParser:
    """Parse PDF, Word, plain-text and Markdown documents into a common shape.

    Every parsing method returns a dictionary with the same four keys:

    - ``text``: the extracted text content
    - ``metadata``: format-specific details; holds an ``error`` entry when
      parsing failed
    - ``sections``: list of ``{"title", "content"}`` dicts split on ``#``
      header lines
    - ``format``: the lower-cased file extension, including the dot
    """

    def __init__(self):
        """Initialize the parser with the list of supported extensions."""
        self.supported_formats = [".pdf", ".docx", ".doc", ".txt", ".md"]

    def parse_file(self, file_path: str) -> Dict[str, Any]:
        """Parse a document file and extract its content.

        The parser is chosen from the file extension (case-insensitive).

        Args:
            file_path: Path to the document file.

        Returns:
            Dict containing ``text``, ``metadata``, ``sections`` and
            ``format`` (see the class docstring).

        Raises:
            ValueError: If the extension is not in ``supported_formats``.
        """
        file_ext = Path(file_path).suffix.lower()
        if file_ext not in self.supported_formats:
            raise ValueError(f"Unsupported format: {file_ext}")

        if file_ext == ".pdf":
            return self._parse_pdf(file_path)
        if file_ext in (".docx", ".doc"):
            return self._parse_word(file_path)
        if file_ext in (".txt", ".md"):
            return self._parse_text(file_path)

        # Only reachable when ``supported_formats`` has been extended with an
        # extension that has no dedicated parser above.
        return {"text": "", "metadata": {}, "sections": [], "format": file_ext}

    def _parse_pdf(self, file_path: str) -> Dict[str, Any]:
        """Parse a PDF file with ``pdfplumber``.

        Pages that yield no text are skipped; the remaining page texts are
        joined with blank lines. Failures are reported in the result rather
        than raised.

        Args:
            file_path: Path to the PDF file.

        Returns:
            Result dict. On success ``metadata`` holds ``pages`` and
            ``title``; on failure it holds ``error``.
        """
        try:
            # Imported lazily so the class stays usable without pdfplumber.
            import pdfplumber

            page_texts = []
            metadata = {}
            with pdfplumber.open(file_path) as pdf:
                metadata["pages"] = len(pdf.pages)
                metadata["title"] = pdf.metadata.get("Title", "Unknown")
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)

            # Join once and reuse for both the text and the section split.
            text = "\n\n".join(page_texts)
            return {
                "text": text,
                "metadata": metadata,
                "sections": self._extract_sections(text),
                "format": ".pdf",
            }
        except ImportError:
            message = "PDF parsing requires pdfplumber"
            return {
                # Kept in ``text`` for callers that already display it.
                "text": message,
                # Also flagged here so failures can be detected uniformly.
                "metadata": {"error": message},
                "sections": [],
                "format": ".pdf",
            }
        except Exception as e:
            # Best-effort parser: report the failure instead of raising.
            return {
                "text": "",
                "metadata": {"error": str(e)},
                "sections": [],
                "format": ".pdf",
            }

    def _parse_word(self, file_path: str) -> Dict[str, Any]:
        """Parse a Word document with ``python-docx``.

        Non-blank paragraphs are joined with blank lines. Failures are
        reported in the result rather than raised.

        NOTE(review): python-docx is believed to read only ``.docx``; legacy
        ``.doc`` files will presumably end up in the error path - confirm.

        Args:
            file_path: Path to the Word file.

        Returns:
            Result dict. On success ``metadata`` holds ``paragraphs`` (the
            total paragraph count, blank ones included); on failure it holds
            ``error``. ``format`` is the file's own extension, falling back
            to ``".docx"`` when the path has none.
        """
        # Report the real extension (".doc" vs ".docx") like _parse_text does.
        file_format = Path(file_path).suffix.lower() or ".docx"
        try:
            # Imported lazily so the class stays usable without python-docx.
            from docx import Document

            doc = Document(file_path)
            metadata = {"paragraphs": len(doc.paragraphs)}
            paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
            text = "\n\n".join(paragraphs)
            return {
                "text": text,
                "metadata": metadata,
                "sections": self._extract_sections(text),
                "format": file_format,
            }
        except ImportError:
            message = "Word parsing requires python-docx"
            return {
                # Kept in ``text`` for callers that already display it.
                "text": message,
                # Also flagged here so failures can be detected uniformly.
                "metadata": {"error": message},
                "sections": [],
                "format": file_format,
            }
        except Exception as e:
            # Best-effort parser: report the failure instead of raising.
            return {
                "text": "",
                "metadata": {"error": str(e)},
                "sections": [],
                "format": file_format,
            }

    def _parse_text(self, file_path: str) -> Dict[str, Any]:
        """Parse a plain-text or Markdown file.

        Args:
            file_path: Path to the text file.

        Returns:
            Result dict. On success ``metadata`` holds ``lines``; on failure
            (missing file, undecodable bytes, ...) it holds ``error``.
        """
        file_ext = Path(file_path).suffix.lower()
        try:
            # "utf-8-sig" reads UTF-8 with or without a BOM. A BOM left in the
            # text would hide a "#" header on the very first line.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                text = f.read()
            return {
                "text": text,
                "metadata": {"lines": len(text.split("\n"))},
                "sections": self._extract_sections(text),
                "format": file_ext,
            }
        except Exception as e:
            # Best-effort parser: report the failure instead of raising.
            return {
                "text": "",
                "metadata": {"error": str(e)},
                "sections": [],
                "format": file_ext,
            }

    def _extract_sections(self, text: str) -> List[Dict[str, str]]:
        """Split text into sections on lines that start with ``#``.

        Text before the first header is collected under the title
        ``"Introduction"``. Blank lines are not kept in section content. A
        header with no body lines yields a section with empty content,
        whether it sits mid-document or at the very end.

        Args:
            text: Document text.

        Returns:
            List of ``{"title", "content"}`` dicts. When no section is found,
            a single ``"Content"`` section holding the whole text.
        """
        sections = []
        title = None
        body = []

        for line in text.split("\n"):
            if line.startswith("#"):
                # A new header closes the section collected so far.
                if title:
                    sections.append({"title": title, "content": "\n".join(body)})
                title = line.lstrip("#").strip()
                body = []
            elif line.strip():
                if not title:
                    # Content without a (non-empty) header above it.
                    title = "Introduction"
                body.append(line)

        # Keep the last section even when its header has no body, so trailing
        # headings are not lost from the outline.
        if title:
            sections.append({"title": title, "content": "\n".join(body)})

        return sections if sections else [{"title": "Content", "content": text}]

    def parse_text_input(self, text: str) -> Dict[str, Any]:
        """Parse raw text input that did not come from a file.

        Args:
            text: Raw text content.

        Returns:
            Result dict with ``lines`` and ``words`` counts in ``metadata``
            and ``format`` set to ``".txt"``.
        """
        return {
            "text": text,
            "metadata": {"lines": len(text.split("\n")), "words": len(text.split())},
            "sections": self._extract_sections(text),
            "format": ".txt",
        }
|