WebScraper.pro / app /services /export_service.py
LovnishVerma's picture
Update app/services/export_service.py
c62b5b9 verified
"""
Export service — generates JSON, CSV, and Excel files from scraped results.
Files are stored in the exports/ directory and tracked in the database.
"""
from __future__ import annotations
import csv
import json
import logging
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment
from app.models import db, ScrapeJob, ScrapeResult, ExportRecord
logger = logging.getLogger(__name__)
BASE_DIR = Path(__file__).resolve().parent.parent.parent
EXPORT_DIR = BASE_DIR / "exports"
def _ensure_export_dir() -> Path:
EXPORT_DIR.mkdir(parents=True, exist_ok=True)
return EXPORT_DIR
def _get_results(job_id: int) -> list[ScrapeResult]:
return (
ScrapeResult.query.filter_by(job_id=job_id)
.order_by(ScrapeResult.page_num, ScrapeResult.item_index)
.all()
)
def _record_export(job_id: int, fmt: str, filename: str, filepath: str, row_count: int) -> ExportRecord:
size = os.path.getsize(filepath) if os.path.exists(filepath) else 0
record = ExportRecord(
job_id=job_id,
format=fmt,
filename=filename,
filepath=filepath,
file_size_bytes=size,
row_count=row_count,
)
db.session.add(record)
db.session.commit()
return record
def export_json(job_id: int) -> Optional[str]:
"""Export results as a JSON file. Returns the filepath."""
job = ScrapeJob.query.get(job_id)
if not job:
return None
results = _get_results(job_id)
export_dir = _ensure_export_dir()
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
filename = f"job_{job_id}_{ts}.json"
filepath = str(export_dir / filename)
data = {
"job": job.to_dict(),
"total_items": len(results),
"exported_at": datetime.now(timezone.utc).isoformat(),
"results": [r.to_dict() for r in results],
}
with open(filepath, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
_record_export(job_id, "json", filename, filepath, len(results))
logger.info("JSON export for job %s: %s", job_id, filepath)
return filepath
def export_csv(job_id: int) -> Optional[str]:
"""Export results as a CSV file."""
job = ScrapeJob.query.get(job_id)
if not job:
return None
results = _get_results(job_id)
export_dir = _ensure_export_dir()
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
filename = f"job_{job_id}_{ts}.csv"
filepath = str(export_dir / filename)
fieldnames = ["id", "job_id", "page_num", "item_index", "page_url", "content_type", "content", "created_at"]
with open(filepath, "w", newline="", encoding="utf-8-sig") as f: # BOM for Excel compat
writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
writer.writeheader()
for r in results:
row = r.to_dict()
row.pop("metadata", None)
writer.writerow(row)
_record_export(job_id, "csv", filename, filepath, len(results))
logger.info("CSV export for job %s: %s", job_id, filepath)
return filepath
def export_excel(job_id: int) -> Optional[str]:
"""Export results as a styled Excel (.xlsx) file."""
job = ScrapeJob.query.get(job_id)
if not job:
return None
results = _get_results(job_id)
export_dir = _ensure_export_dir()
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
filename = f"job_{job_id}_{ts}.xlsx"
filepath = str(export_dir / filename)
wb = openpyxl.Workbook()
# --- Results sheet ---
ws = wb.active
ws.title = "Results"
header_fill = PatternFill("solid", fgColor="1A1A2E")
header_font = Font(color="FFFFFF", bold=True)
headers = ["#", "Page", "Index", "URL", "Type", "Content", "Scraped At"]
for col_idx, header in enumerate(headers, 1):
cell = ws.cell(row=1, column=col_idx, value=header)
cell.fill = header_fill
cell.font = header_font
cell.alignment = Alignment(horizontal="center")
for row_idx, r in enumerate(results, 2):
ws.cell(row=row_idx, column=1, value=r.id)
ws.cell(row=row_idx, column=2, value=r.page_num)
ws.cell(row=row_idx, column=3, value=r.item_index)
ws.cell(row=row_idx, column=4, value=r.page_url)
ws.cell(row=row_idx, column=5, value=r.content_type)
ws.cell(row=row_idx, column=6, value=(r.content or "")[:32767]) # Excel cell limit
ws.cell(row=row_idx, column=7, value=r.created_at.isoformat() if r.created_at else "")
ws.column_dimensions["D"].width = 40
ws.column_dimensions["F"].width = 60
# --- Summary sheet ---
ws2 = wb.create_sheet("Summary")
job_dict = job.to_dict()
ws2.cell(1, 1, "Field").font = Font(bold=True)
ws2.cell(1, 2, "Value").font = Font(bold=True)
for i, (k, v) in enumerate(job_dict.items(), 2):
ws2.cell(i, 1, k)
ws2.cell(i, 2, str(v))
ws2.column_dimensions["A"].width = 25
ws2.column_dimensions["B"].width = 50
wb.save(filepath)
_record_export(job_id, "excel", filename, filepath, len(results))
logger.info("Excel export for job %s: %s", job_id, filepath)
return filepath
def get_job_exports(job_id: int) -> list[ExportRecord]:
return ExportRecord.query.filter_by(job_id=job_id).order_by(ExportRecord.created_at.desc()).all()