Ragcore / app /utils /parsers.py
NinjainPJs's picture
Initial deploy: RagCore RAG system with hybrid search and Gradio UI
a34068e
import logging
from pathlib import Path
from app.utils.helpers import clean_text
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Lower-case file suffixes (with leading dot); the same four suffixes that
# parse_document dispatches on.
# NOTE(review): not referenced by the code visible in this module —
# presumably consumed by callers (e.g. upload validation); confirm.
SUPPORTED_EXTENSIONS = {".pdf", ".txt", ".html", ".htm"}
def parse_pdf(file_bytes: bytes, filename: str) -> str:
    """Extract the text of a PDF held in memory.

    Each page's extracted text is collected (pages yielding no text are
    skipped), joined with blank lines, and passed through ``clean_text``.

    Args:
        file_bytes: Raw contents of the PDF file.
        filename: Name used only in log messages.

    Returns:
        The cleaned text, or ``""`` if anything in the process raises
        (including a missing ``pypdf`` install); the failure is logged.
    """
    try:
        # Imported lazily so a missing pypdf is reported like any other
        # parse failure instead of breaking module import.
        from io import BytesIO
        from pypdf import PdfReader

        document = PdfReader(BytesIO(file_bytes))
        extracted = (page.extract_text() for page in document.pages)
        raw = "\n\n".join(chunk for chunk in extracted if chunk)
        logger.info(f"Parsed PDF '{filename}': {len(document.pages)} pages, {len(raw)} chars")
        return clean_text(raw)
    except Exception as exc:
        logger.error(f"Failed to parse PDF '{filename}': {exc}")
        return ""
def parse_text(file_bytes: bytes, filename: str) -> str:
    """Decode a plain-text file and return it passed through ``clean_text``.

    Decoding is attempted as UTF-8 first and falls back to Latin-1 when the
    bytes are not valid UTF-8. Latin-1 maps every byte value, so the
    fallback itself cannot raise.

    Args:
        file_bytes: Raw contents of the text file.
        filename: Name used only in the log message.

    Returns:
        The decoded text after ``clean_text`` has been applied.
    """
    try:
        # "utf-8-sig" decodes ordinary UTF-8 identically but also consumes a
        # leading byte-order mark, which plain "utf-8" would leave in the
        # text as a stray U+FEFF character.
        text = file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        text = file_bytes.decode("latin-1")
    logger.info(f"Parsed text '{filename}': {len(text)} chars")
    return clean_text(text)
def parse_html(file_bytes: bytes, filename: str) -> str:
    """Extract the text of an HTML document held in memory.

    ``script``, ``style``, ``nav``, ``footer`` and ``header`` elements are
    removed from the parsed tree before the remaining text is gathered
    (newline-separated) and passed through ``clean_text``.

    Args:
        file_bytes: Raw contents of the HTML file.
        filename: Name used only in log messages.

    Returns:
        The cleaned text, or ``""`` if anything in the process raises
        (including a missing ``bs4`` install); the failure is logged.
    """
    try:
        # Lazy import: a missing BeautifulSoup is handled as a parse failure.
        from bs4 import BeautifulSoup

        document = BeautifulSoup(file_bytes, "html.parser")
        unwanted = document.find_all(["script", "style", "nav", "footer", "header"])
        for element in unwanted:
            element.decompose()
        visible = document.get_text(separator="\n")
        logger.info(f"Parsed HTML '{filename}': {len(visible)} chars")
        return clean_text(visible)
    except Exception as exc:
        logger.error(f"Failed to parse HTML '{filename}': {exc}")
        return ""
def parse_document(file_bytes: bytes, filename: str) -> str:
    """Route a file to the parser matching its (case-insensitive) suffix.

    Args:
        file_bytes: Raw contents of the file.
        filename: File name; its suffix selects the parser and the name is
            forwarded to that parser.

    Returns:
        Whatever the selected parser returns, or ``""`` (with a warning
        logged) when the suffix is not one of ``.pdf``, ``.html``, ``.htm``
        or ``.txt``.
    """
    suffix = Path(filename).suffix.lower()
    handlers = {
        ".pdf": parse_pdf,
        ".html": parse_html,
        ".htm": parse_html,
        ".txt": parse_text,
    }
    handler = handlers.get(suffix)
    if handler is None:
        logger.warning(f"Unsupported file type '{suffix}' for '{filename}'")
        return ""
    return handler(file_bytes, filename)
def get_page_count(file_bytes: bytes, filename: str) -> int | None:
ext = Path(filename).suffix.lower()
if ext == ".pdf":
try:
from pypdf import PdfReader
from io import BytesIO
return len(PdfReader(BytesIO(file_bytes)).pages)
except Exception:
return None
return None