# Report-Generator / json_processor_v3.py
# Uploaded via huggingface_hub by Jaimodiji (commit 92a22cd, verified).
import json
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from jsonschema import validate, ValidationError
import uuid
from flask import current_app, url_for
from werkzeug.utils import secure_filename
import sqlite3 # Import sqlite3
import sys
# Ensure current directory is in Python path for local imports
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from database import get_db_connection
from utils import create_a4_pdf_from_images
# JSON v3.0 Schema for validation.
# Enforced by JSONProcessorV3.validate() via jsonschema before any DB work.
JSON_V3_SCHEMA = {
    "type": "object",
    "properties": {
        # Payload format version; must be the literal string "3.0".
        "version": {"type": "string", "const": "3.0"},
        # Source tag stored on the session row (process() falls back to 'manual' when absent).
        "source": {"type": "string"},
        "test_name": {"type": "string"},
        "test_id": {"type": "string"},
        "test_mapping_id": {"type": "string"},
        # Free-form object; serialized with json.dumps and stored on the session row.
        # Not listed in the top-level "required" keys, so it is optional.
        "metadata": {"type": "object"},
        "config": {
            "type": "object",
            "properties": {
                # Question statuses that should be included in the report.
                "statuses_to_include": {"type": "array", "items": {"type": "string"}},
                # Page layout for the auto-generated PDF (used when "view" is true).
                "layout": {
                    "type": "object",
                    "properties": {
                        "images_per_page": {"type": "integer"},
                        "orientation": {"type": "string"}
                    },
                    "required": ["images_per_page", "orientation"]
                }
            },
            "required": ["statuses_to_include", "layout"]
        },
        "questions": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "question_number": {"type": "string"},
                    # Remote image location; downloaded in parallel during processing.
                    "image_url": {"type": "string", "format": "uri"},
                    "status": {"type": "string"},
                    "marked_solution": {"type": "string"},
                    "correct_solution": {"type": "string"},
                    "subject": {"type": "string"},
                    "chapter": {"type": "string"},
                    "topic": {"type": "string"},
                    "time_taken": {"type": "integer"}
                },
                # "chapter" and "topic" are intentionally optional per question.
                "required": ["question_number", "image_url", "status", "marked_solution", "correct_solution", "subject", "time_taken"]
            }
        },
        # True: auto-generate the PDF; False: create a session for manual review.
        "view": {"type": "boolean"}
    },
    "required": ["version", "source", "test_name", "test_id", "test_mapping_id", "config", "questions", "view"]
}
class JSONProcessorV3:
    """Processor for JSON v3.0 report payloads.

    Responsibilities:
      * validate the payload against JSON_V3_SCHEMA,
      * download all referenced question images in parallel,
      * persist a session plus per-question image/answer rows to SQLite,
      * optionally auto-generate an A4 PDF report when the payload sets
        "view": true.

    NOTE(review): process() and download_images_parallel() use
    ``current_app`` for config and logging, so they must run inside a
    Flask application context.
    """

    def __init__(self, data=None):
        # Decoded JSON payload (dict) as received from the upload endpoint;
        # may be None until assigned, in which case process() raises.
        self.data = data

    def validate(self):
        """Validates the JSON data against the v3.0 schema.

        Returns:
            True when the payload conforms to JSON_V3_SCHEMA.

        Raises:
            ValueError: wrapping the jsonschema message on failure (never
                returns False — callers that test the return value are
                only being defensive).
        """
        try:
            validate(instance=self.data, schema=JSON_V3_SCHEMA)
            return True
        except ValidationError as e:
            # Normalize to ValueError so callers handle a single exception type.
            raise ValueError(f"Schema validation failed: {e.message}")

    def download_image_from_url(self, url, save_path, timeout=30):
        """Downloads an image from a URL and saves it to a path.

        Args:
            url: HTTP(S) URL of the image.
            save_path: Local filesystem path to write the bytes to.
            timeout: Per-request timeout in seconds.

        Returns:
            save_path on success, or None on any request failure.
        """
        try:
            # NOTE(review): the whole response body is buffered in memory
            # (no stream=True); fine for question-sized images, revisit if
            # payloads grow large.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            with open(save_path, 'wb') as f:
                f.write(response.content)
            return save_path
        except requests.exceptions.RequestException as e:
            print(f"Error downloading image from {url}: {e}") # Keep print for tests
            # current_app is an unbound (falsy) proxy outside an app context,
            # so this guard lets the method run in plain unit tests.
            if current_app:
                current_app.logger.error(f"Error downloading image from {url}: {e}")
            return None

    def download_images_parallel(self, questions, output_dir, session_id, max_workers=10):
        """Downloads all images in parallel and returns a map of question number to local path.

        Args:
            questions: List of question dicts; entries without an
                'image_url' are skipped.
            output_dir: Directory the images are written into.
            session_id: Session UUID used to namespace the filenames.
            max_workers: Thread pool size for concurrent downloads.

        Returns:
            dict mapping question_number -> saved local file path, for
            successful downloads only.
        """
        image_paths = {}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # One task per question with an image URL; each file is named
            # <session_id>_q_<question_number>.png.
            # NOTE(review): question_number goes into the filename unsanitized;
            # a value containing path separators could escape output_dir —
            # TODO confirm upstream guarantees / apply secure_filename.
            future_to_question = {
                executor.submit(
                    self.download_image_from_url,
                    q['image_url'],
                    os.path.join(output_dir, f"{session_id}_q_{q['question_number']}.png")
                ): q for q in questions if q.get('image_url')
            }
            # Futures are consumed on the calling thread, so current_app is
            # still bound to the caller's application context here.
            for future in as_completed(future_to_question):
                question = future_to_question[future]
                url = question['image_url']
                try:
                    path = future.result()
                    if path:
                        image_paths[question['question_number']] = path
                        current_app.logger.info(f"Successfully downloaded image from {url}")
                    else:
                        current_app.logger.error(f"Failed to download image from {url}")
                except Exception as e:
                    # Defensive: download_image_from_url returns None on request
                    # errors, so this only fires on unexpected worker failures.
                    current_app.logger.error(f"Error processing image for question {question.get('question_number')} from {url}: {e}")
        return image_paths

    def process(self, user_id=1): # Default user_id for now, replace with actual user
        """Main processing logic for the v3.0 payload, including DB insertion and PDF generation.

        Args:
            user_id: Owner for the created session/PDF rows (defaults to 1).

        Returns:
            dict with 'status', 'message' and either 'view_url' (PDF was
            auto-generated) or 'edit_url' (manual-review session created).

        Raises:
            ValueError: for missing data, schema failures, DB errors, or
                any unexpected error (all failures are normalized to
                ValueError for the calling endpoint).
        """
        if not self.data:
            raise ValueError("No data provided to process.")
        current_app.logger.info("Starting processing of JSON v3.0 payload.")
        current_app.logger.info(f"Test Name: {self.data.get('test_name')}")
        current_app.logger.info(f"Test ID: {self.data.get('test_id')}")
        current_app.logger.info(f"Metadata: {self.data.get('metadata')}")
        # validate() raises on failure and otherwise returns True, so this
        # branch is defensive only.
        if not self.validate():
            raise ValueError("Schema validation failed.")
        conn = get_db_connection()
        try:
            # Required fields — presence is guaranteed by the schema.
            test_name = self.data['test_name']
            test_id = self.data['test_id']
            test_mapping_id = self.data['test_mapping_id']
            questions_payload = self.data['questions']
            view_mode = self.data.get('view', False)
            metadata = json.dumps(self.data.get('metadata', {})) # Store metadata as JSON string
            # Layout configuration with fallbacks (4 images/page, portrait).
            config = self.data.get('config', {})
            layout = config.get('layout', {})
            images_per_page = layout.get('images_per_page', 4)
            orientation = layout.get('orientation', 'portrait')
            session_id = str(uuid.uuid4())
            original_filename = f"{test_name}.json" # Name of the JSON file that was uploaded
            conn.execute(
                'INSERT INTO sessions (id, original_filename, user_id, test_id, test_mapping_id, source, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)',
                (session_id, original_filename, user_id, test_id, test_mapping_id, self.data.get('source', 'manual'), metadata)
            )
            processed_folder = current_app.config.get('PROCESSED_FOLDER', 'processed')
            os.makedirs(processed_folder, exist_ok=True)
            current_app.logger.info(f"Downloading images for test {test_id} to {processed_folder}")
            image_path_map = self.download_images_parallel(questions_payload, processed_folder, session_id)
            # NOTE(review): image_records is never used — candidate for removal.
            image_records = []
            question_records = []
            for i, q_data in enumerate(questions_payload):
                question_number = q_data['question_number']
                # Check if image was downloaded
                processed_filename = None
                local_image_path = image_path_map.get(question_number)
                if local_image_path:
                    processed_filename = os.path.basename(local_image_path)
                # Insert into images table — one execute per row (rather than
                # executemany) because the generated image_id (lastrowid) is
                # needed to link the matching questions row below.
                image_insert_result = conn.execute(
                    'INSERT INTO images (session_id, image_index, filename, original_name, processed_filename, image_type) VALUES (?, ?, ?, ?, ?, ?)',
                    (session_id, i + 1, q_data.get('image_url', ''), f"Question {question_number}", processed_filename, 'cropped' if processed_filename else 'original_url_only')
                )
                image_id = image_insert_result.lastrowid
                # Insert into questions table
                question_records.append((
                    session_id, image_id, question_number, q_data['status'],
                    q_data['marked_solution'], q_data['correct_solution'],
                    q_data.get('subject'), q_data.get('chapter'), q_data.get('topic'), q_data.get('time_taken')
                ))
            # Bulk-insert all question rows; note the payload's
            # 'correct_solution' maps onto the DB column 'actual_solution'.
            conn.executemany(
                'INSERT INTO questions (session_id, image_id, question_number, status, marked_solution, actual_solution, subject, chapter, topic, time_taken) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                question_records
            )
            conn.commit()
            response_data = {
                "status": "success",
                "message": "JSON v3.0 processed successfully."
            }
            if view_mode:
                # Re-read the rows just written so the PDF generator gets
                # processed_filename joined onto each question.
                query = "SELECT q.*, i.processed_filename FROM questions q JOIN images i ON q.image_id = i.id WHERE q.session_id = ? ORDER BY i.id"
                all_questions = [dict(row) for row in conn.execute(query, (session_id,)).fetchall()]
                if not all_questions:
                    # NOTE(review): the session/questions were already
                    # committed above, so this rollback cannot undo them —
                    # TODO confirm whether the commit should move after PDF
                    # generation instead.
                    conn.rollback()
                    raise ValueError('No questions found for PDF generation.')
                pdf_output_folder = current_app.config.get('OUTPUT_FOLDER', 'output')
                os.makedirs(pdf_output_folder, exist_ok=True)
                # secure_filename guards against path characters in test_name;
                # the short session prefix keeps filenames unique per run.
                pdf_filename = f"{secure_filename(test_name)}_{session_id[:8]}.pdf"
                create_a4_pdf_from_images(
                    image_info=all_questions, base_folder=processed_folder, output_filename=pdf_filename,
                    images_per_page=images_per_page, output_folder=pdf_output_folder,
                    orientation=orientation
                )
                conn.execute(
                    'INSERT INTO generated_pdfs (session_id, filename, subject, tags, notes, source_filename, user_id) VALUES (?, ?, ?, ?, ?, ?, ?)',
                    (session_id, pdf_filename, test_name, test_mapping_id, 'Generated automatically via JSON v3.0 upload.', original_filename, user_id)
                )
                conn.commit()
                response_data['view_url'] = url_for('main.view_pdf', filename=pdf_filename, _external=True)
                response_data['message'] = "PDF auto-generated and saved."
            else:
                # No PDF requested: hand the caller a link to the manual
                # question-review UI for this session.
                response_data['edit_url'] = url_for('main.question_entry_v2', session_id=session_id, test_name=test_name, _external=True)
                response_data['message'] = "Session created for manual review."
            return response_data
        except ValueError as e:
            if conn:
                conn.rollback()
            current_app.logger.error(f"JSON v3.0 processing error: {e}")
            raise # Re-raise to be caught by the endpoint
        except sqlite3.Error as e:
            if conn:
                conn.rollback()
            current_app.logger.error(f"Database error during JSON v3.0 processing: {e}")
            raise ValueError(f"Database error: {e}")
        except Exception as e:
            # Last-resort normalization so the endpoint only sees ValueError.
            if conn:
                conn.rollback()
            current_app.logger.error(f"Unhandled error during JSON v3.0 processing: {e}")
            raise ValueError(f"An unexpected error occurred: {e}")
        finally:
            if conn:
                conn.close()