# Jaimodiji's picture
# Upload folder using huggingface_hub
# 92a22cd verified
import os
import re
import sqlite3
import sys
import uuid
from datetime import datetime, timedelta
import click
import fitz # PyMuPDF
import requests
from rich.progress import (
BarColumn,
Progress,
SpinnerColumn,
TextColumn,
TimeElapsedColumn,
TimeRemainingColumn,
)
from urllib.parse import urlparse
from werkzeug.utils import secure_filename
# --- Configuration ---
from utils import get_db_connection
# All working directories live next to this script (symlinks resolved).
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
UPLOAD_FOLDER, PROCESSED_FOLDER, OUTPUT_FOLDER = (
    os.path.join(SCRIPT_DIR, _subdir)
    for _subdir in ('uploads', 'processed', 'output')
)

# Create the directories up front so later file writes never hit a missing folder.
for _folder in (UPLOAD_FOLDER, PROCESSED_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)
del _folder
# --- Core Logic Functions (mirrored from app.py) ---
def setup_database_cli():
    """Initializes the database and creates/updates tables as needed.

    Creates every table with ``CREATE TABLE IF NOT EXISTS`` and then adds the
    columns that older databases may lack. The connection is always closed,
    even when a statement fails, and the success message is only printed
    after the changes have been committed.

    Raises:
        sqlite3.Error: if any schema statement fails.
    """
    create_statements = (
        "CREATE TABLE IF NOT EXISTS sessions (id TEXT PRIMARY KEY, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, original_filename TEXT, persist INTEGER DEFAULT 0, subject TEXT, tags TEXT, notes TEXT);",
        "CREATE TABLE IF NOT EXISTS images (id INTEGER PRIMARY KEY AUTOINCREMENT, session_id TEXT NOT NULL, image_index INTEGER NOT NULL, filename TEXT NOT NULL, original_name TEXT NOT NULL, processed_filename TEXT, image_type TEXT DEFAULT 'original', FOREIGN KEY (session_id) REFERENCES sessions (id));",
        "CREATE TABLE IF NOT EXISTS questions (id INTEGER PRIMARY KEY AUTOINCREMENT, session_id TEXT NOT NULL, image_id INTEGER NOT NULL, question_number TEXT, subject TEXT, status TEXT, marked_solution TEXT, actual_solution TEXT, time_taken TEXT, tags TEXT, FOREIGN KEY (session_id) REFERENCES sessions (id), FOREIGN KEY (image_id) REFERENCES images (id));",
        "CREATE TABLE IF NOT EXISTS folders (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, parent_id INTEGER, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (parent_id) REFERENCES folders (id) ON DELETE CASCADE);",
        "CREATE TABLE IF NOT EXISTS generated_pdfs (id INTEGER PRIMARY KEY AUTOINCREMENT, session_id TEXT NOT NULL, filename TEXT NOT NULL, subject TEXT NOT NULL, tags TEXT, notes TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, source_filename TEXT, folder_id INTEGER, persist INTEGER DEFAULT 0, FOREIGN KEY (session_id) REFERENCES sessions (id), FOREIGN KEY (folder_id) REFERENCES folders (id) ON DELETE SET NULL);",
        "CREATE TABLE IF NOT EXISTS neetprep_questions (id TEXT PRIMARY KEY, question_text TEXT, options TEXT, correct_answer_index INTEGER, level TEXT, topic TEXT, subject TEXT, last_fetched_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP);",
        "CREATE TABLE IF NOT EXISTS neetprep_processed_attempts (attempt_id TEXT PRIMARY KEY, processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP);",
    )
    # TEXT columns that databases created by older versions may be missing.
    migrated_columns = (
        ("sessions", "subject"),
        ("sessions", "tags"),
        ("sessions", "notes"),
        ("questions", "tags"),
    )

    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        click.echo("Creating/updating tables...")
        for statement in create_statements:
            cursor.execute(statement)

        for table, column in migrated_columns:
            # Inspect the schema instead of probing with a SELECT, so an
            # unrelated OperationalError is not mistaken for "column missing".
            # Table/column names are the hard-coded constants above.
            cursor.execute(f"PRAGMA table_info({table})")
            existing = {row[1] for row in cursor.fetchall()}  # index 1 = column name
            if column not in existing:
                cursor.execute(f"ALTER TABLE {table} ADD COLUMN {column} TEXT")

        conn.commit()
        click.echo("Tables created successfully.")
    finally:
        conn.close()
def _remove_if_exists(folder, filename):
    """Delete ``filename`` inside ``folder`` when it exists; empty names are ignored."""
    if not filename:
        return
    path = os.path.join(folder, filename)
    if os.path.exists(path):
        os.remove(path)


def cleanup_old_data_cli():
    """Removes sessions, files, and PDFs older than 1 day, unless persisted.

    For each expired, non-persisted session the uploaded and processed image
    files are deleted, followed by its question, image and session rows.
    Expired, non-persisted generated PDFs are removed the same way. All
    deletions are committed together at the end; the connection is closed
    even if a step fails.
    """
    from datetime import timezone

    # ``created_at`` is filled by SQLite's CURRENT_TIMESTAMP, which is UTC in
    # 'YYYY-MM-DD HH:MM:SS' form, so the cutoff must be UTC text as well.
    # NOTE(review): assumes rows written elsewhere also rely on that default.
    cutoff = (datetime.now(timezone.utc) - timedelta(days=1)).strftime('%Y-%m-%d %H:%M:%S')

    conn = get_db_connection()
    try:
        click.echo(f"Starting cleanup for items older than {cutoff} UTC:")

        old_sessions = conn.execute(
            'SELECT id FROM sessions WHERE created_at < ? AND persist = 0', (cutoff,)
        ).fetchall()
        click.echo(f"Found {len(old_sessions)} old, non-persisted sessions to delete.")
        for session in old_sessions:
            session_id = session['id']
            images_to_delete = conn.execute(
                'SELECT filename, processed_filename FROM images WHERE session_id = ?',
                (session_id,),
            ).fetchall()
            for img in images_to_delete:
                _remove_if_exists(UPLOAD_FOLDER, img['filename'])
                _remove_if_exists(PROCESSED_FOLDER, img['processed_filename'])
            # Child rows first, then the session they reference.
            conn.execute('DELETE FROM questions WHERE session_id = ?', (session_id,))
            conn.execute('DELETE FROM images WHERE session_id = ?', (session_id,))
            conn.execute('DELETE FROM sessions WHERE id = ?', (session_id,))

        old_pdfs = conn.execute(
            'SELECT id, filename FROM generated_pdfs WHERE created_at < ? AND persist = 0',
            (cutoff,),
        ).fetchall()
        click.echo(f"Found {len(old_pdfs)} old, non-persisted generated PDFs to delete.")
        for pdf in old_pdfs:
            _remove_if_exists(OUTPUT_FOLDER, pdf['filename'])
            conn.execute('DELETE FROM generated_pdfs WHERE id = ?', (pdf['id'],))

        conn.commit()
    finally:
        conn.close()
def _get_local_pdf_path(path_or_url):
    """
    Takes a path or URL. If it's a URL, downloads it to the UPLOAD_FOLDER.
    Returns (local_path, original_filename, is_temp_file)

    Local paths are returned unchanged with ``is_temp_file`` False. URLs must
    be Google Drive links or links whose path ends in ``.pdf`` (a query
    string or fragment after the path is allowed). On any failure an error is
    printed and ``(None, None, False)`` is returned; a partially written
    download is removed.
    """
    from urllib.parse import parse_qs

    if not path_or_url.lower().startswith(('http://', 'https://')):
        if not os.path.exists(path_or_url):
            click.secho(f"Error: File not found at {path_or_url}", fg="red", err=True)
            return None, None, False
        return path_or_url, secure_filename(os.path.basename(path_or_url)), False

    click.echo(f"Downloading from URL: {path_or_url}")
    local_path = None
    try:
        parsed = urlparse(path_or_url)
        is_drive = "drive.google.com" in path_or_url
        if is_drive:
            # Supported shapes: /file/d/<id>/..., ?id=<id>, and (as before)
            # "second to last path segment" as the final fallback.
            id_in_path = re.search(r'/d/([^/]+)', parsed.path)
            ids_in_query = parse_qs(parsed.query).get('id')
            if id_in_path:
                file_id = id_in_path.group(1)
            elif ids_in_query:
                file_id = ids_in_query[0]
            else:
                file_id = path_or_url.split('/')[-2]
            download_url = f'https://drive.google.com/uc?export=download&id={file_id}'
            name_from_url = None
        elif parsed.path.lower().endswith('.pdf'):
            download_url = path_or_url
            name_from_url = os.path.basename(parsed.path)
        elif path_or_url.lower().endswith('.pdf'):
            # Previously accepted form: the raw URL text ends in .pdf.
            download_url = path_or_url
            name_from_url = path_or_url.split('/')[-1]
        else:
            raise ValueError("URL is not a recognized Google Drive or direct .pdf link.")

        # (connect, read) timeouts so a dead server cannot hang the CLI forever.
        with requests.get(download_url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()
            if is_drive:
                content_type = response.headers.get('content-type', '')
                if 'text/html' in content_type.lower():
                    # Drive answers with an HTML page (permission / confirmation)
                    # instead of the file; saving that as a PDF would be garbage.
                    raise ValueError("Google Drive returned an HTML page instead of a file.")
                content_disposition = response.headers.get('content-disposition') or ''
                filenames = re.findall(r'filename="(.+)"', content_disposition)
                original_name = secure_filename(filenames[0]) if filenames else ''
            else:
                original_name = secure_filename(name_from_url)
            # secure_filename may strip everything; never end up with an empty name.
            original_name = original_name or f"{str(uuid.uuid4())}.pdf"

            local_path = os.path.join(UPLOAD_FOLDER, f"temp_{original_name}")
            with open(local_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return local_path, original_name, True
    except Exception as e:
        if local_path and os.path.exists(local_path):
            try:
                os.remove(local_path)  # do not leave a truncated download behind
            except OSError:
                pass  # best-effort; the download error below is what matters
        click.secho(f"Error downloading file: {e}", fg="red", err=True)
        return None, None, False
# --- CLI Group ---
# Root command group; every subcommand below attaches itself via @cli.command.
@click.group()
def cli():
    """A CLI for managing the Report Generator application."""
# --- CLI Commands ---
@cli.command()
def db_init():
    """Initializes or updates the database schema."""
    # Announce, run the schema setup, then confirm.
    click.echo(click.style("Initializing database schema...", fg="yellow"))
    setup_database_cli()
    click.echo(click.style("Database schema is up to date.", fg="green"))
@cli.command()
def db_cleanup():
    """Cleans up old, non-persisted data."""
    # Announce, run the cleanup routine, then confirm.
    click.echo(click.style("Starting cleanup of old data...", fg="yellow"))
    cleanup_old_data_cli()
    click.echo(click.style("Cleanup finished.", fg="green"))
@cli.command('add-question')
@click.option('--session-id', required=True, type=click.STRING)
@click.option('--image-path', required=True, type=click.Path(exists=True))
@click.option('--q-num', type=click.STRING)
@click.option('--status', type=click.Choice(['Correct', 'Wrong', 'Unattempted']))
@click.option('--marked-ans', type=click.STRING)
@click.option('--correct-ans', type=click.STRING)
@click.option('--subject', type=click.STRING)
@click.option('--time', type=click.STRING)
def add_question(session_id, image_path, q_num, status, marked_ans, correct_ans, subject, time):
    """Adds a single question with metadata to the database."""
    # The docstring above is the command's --help text, so details live here:
    # the image is copied into PROCESSED_FOLDER, an ``images`` row is created
    # with the next free image_index of the session, and a ``questions`` row
    # is linked to it. Any failure prints an error and aborts the command.
    import shutil

    setup_database_cli()  # Ensure database tables exist

    conn = None  # bound before the try so ``finally`` can always test it
    processed_path = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()

        # 1. Copy image to processed folder
        original_filename = secure_filename(os.path.basename(image_path))
        processed_filename = f"processed_{session_id}_{str(uuid.uuid4())[:8]}_{original_filename}"
        processed_path = os.path.join(PROCESSED_FOLDER, processed_filename)
        shutil.copy(image_path, processed_path)

        # 2. Create a new image record at the next available image_index.
        cursor.execute("SELECT MAX(image_index) FROM images WHERE session_id = ?", (session_id,))
        max_index = cursor.fetchone()[0]
        # MAX() is NULL for an empty session. Test for None explicitly: an
        # existing index of 0 is falsy and must yield 1, not 0 again.
        new_index = 0 if max_index is None else max_index + 1
        cursor.execute(
            'INSERT INTO images (session_id, image_index, filename, original_name, processed_filename, image_type) VALUES (?, ?, ?, ?, ?, ?)',
            (session_id, new_index, original_filename, original_filename, processed_filename, 'cropped')
        )
        image_id = cursor.lastrowid

        # 3. Create a new question record
        cursor.execute(
            'INSERT INTO questions (session_id, image_id, question_number, status, marked_solution, actual_solution, subject, time_taken) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
            (session_id, image_id, q_num, status, marked_ans, correct_ans, subject, time)
        )
        conn.commit()
    except Exception as e:
        # Nothing was committed, so the copied image would be unreferenced.
        if processed_path and os.path.exists(processed_path):
            try:
                os.remove(processed_path)
            except OSError:
                pass  # best-effort; report the original error below
        click.secho(f"Error adding question: {e}", fg="red", err=True)
        raise click.Abort() from e
    else:
        click.secho(f"Successfully added question {q_num} (Image ID: {image_id}) to session {session_id}.", fg="green")
    finally:
        if conn is not None:
            conn.close()
def _discard_page_images(session_id):
    """Best-effort removal of page PNGs rendered for a session that was never committed."""
    prefix = f"{session_id}_page_"
    for name in os.listdir(UPLOAD_FOLDER):
        if name.startswith(prefix):
            try:
                os.remove(os.path.join(UPLOAD_FOLDER, name))
            except OSError:
                pass  # cleanup must not mask the error that triggered it


def _render_pages(doc, session_id, num_pages, simple_progress):
    """Render each page of ``doc`` to a PNG in UPLOAD_FOLDER and return the rows for the images table."""
    rows = []

    def render(index, page):
        pix = page.get_pixmap(dpi=150)
        page_filename = f"{session_id}_page_{index}.png"
        pix.save(os.path.join(UPLOAD_FOLDER, page_filename))
        rows.append((session_id, index, page_filename, f"Page {index + 1}", 'original'))

    if simple_progress:
        # Machine-readable progress: one integer percentage per line on stdout.
        for i, page in enumerate(doc):
            render(i, page)
            percentage = int(((i + 1) / num_pages) * 100)
            sys.stdout.write(f"{percentage}\n")
            sys.stdout.flush()
        return rows

    progress = Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(bar_width=None),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TextColumn("• Page {task.completed}/{task.total}"),
        TextColumn("• Elapsed:"), TimeElapsedColumn(),
        TextColumn("• Remaining:"), TimeRemainingColumn(),
    )
    with progress:
        task = progress.add_task("[green]Extracting pages...", total=num_pages)
        for i, page in enumerate(doc):
            render(i, page)
            progress.update(task, advance=1)
    return rows


def _store_final_pdf(local_pdf_path, original_filename, subject, tags, notes):
    """Copy a finished PDF into OUTPUT_FOLDER and record it in sessions/generated_pdfs."""
    import shutil

    session_id = str(uuid.uuid4())
    copied = False
    output_path = None
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('INSERT INTO sessions (id, original_filename) VALUES (?, ?)',
                       (session_id, original_filename))

        output_filename = original_filename
        output_path = os.path.join(OUTPUT_FOLDER, output_filename)
        if os.path.exists(output_path):
            # Never overwrite an existing output; prefix a timestamp instead.
            timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
            output_filename = f"{timestamp}_{original_filename}"
            output_path = os.path.join(OUTPUT_FOLDER, output_filename)
            click.secho(f"Warning: File '{original_filename}' already exists. Saving as '{output_filename}'.", fg="yellow")

        shutil.copy(local_pdf_path, output_path)
        copied = True
        cursor.execute(
            'INSERT INTO generated_pdfs (session_id, filename, subject, tags, notes, source_filename) VALUES (?, ?, ?, ?, ?, ?)',
            (session_id, output_filename, subject, tags, notes, original_filename)
        )
        conn.commit()
    except Exception:
        # The rows were not committed, so the copy would be unreferenced.
        if copied and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise
    finally:
        conn.close()
    click.secho(f"Successfully added final PDF '{original_filename}' to the database.", fg="green")


def _extract_pages_to_session(local_pdf_path, original_filename, simple_progress):
    """Split a PDF into page images under a new session; return False when the PDF has no pages."""
    click.echo(f"Processing PDF: {click.style(original_filename, bold=True)}")
    session_id = str(uuid.uuid4())
    doc = fitz.open(local_pdf_path)
    try:
        num_pages = len(doc)
        if num_pages == 0:
            click.secho("Warning: This PDF has 0 pages. Nothing to process.", fg="yellow")
            return False
        click.echo(f"PDF contains {num_pages} pages to process.")

        conn = get_db_connection()
        try:
            cursor = conn.cursor()
            cursor.execute('INSERT INTO sessions (id, original_filename) VALUES (?, ?)',
                           (session_id, original_filename))
            click.echo(f"Created session: {click.style(session_id, fg='cyan')}")

            images_to_insert = _render_pages(doc, session_id, num_pages, simple_progress)

            click.echo("\nInserting image records into the database...")
            cursor.executemany(
                'INSERT INTO images (session_id, image_index, filename, original_name, image_type) VALUES (?, ?, ?, ?, ?)',
                images_to_insert
            )
            conn.commit()
        except Exception:
            # No image rows were committed, so the rendered PNGs are unreferenced.
            _discard_page_images(session_id)
            raise
        finally:
            conn.close()
        click.secho(f"Successfully committed {len(images_to_insert)} records to the database.", fg="green")
        return True
    finally:
        doc.close()


@cli.command('upload')
@click.argument('pdf_paths', type=click.STRING)
@click.option('--simple-progress', is_flag=True, help='Print simple percentage progress to stdout.')
@click.option('--final', is_flag=True, help='Mark the PDF as a final version and add to generated_pdfs table.')
@click.option('--subject', type=click.STRING, help='Subject for the final PDF.')
@click.option('--tags', type=click.STRING, help='Tags for the final PDF (comma-separated).')
@click.option('--notes', type=click.STRING, help='Notes for the final PDF.')
@click.option('--log', is_flag=True, help='Log all output to cli.log.')
def upload(pdf_paths, simple_progress, final, subject, tags, notes, log):
    """
    A CLI tool to upload a large PDF directly to the application's database.
    PDF_PATHS: A comma-separated list of full paths to the PDF files you wish to upload or Google Drive URLs.
    """
    # The docstring above is the command's --help text, so details live here.
    # Each entry is resolved to a local file (downloading URLs), then either
    # stored as a final PDF (--final) or split into page images under a new
    # session. A failure in one entry is reported and the next one is tried;
    # temporary downloads are always removed.
    import contextlib

    setup_database_cli()  # Ensure database tables exist

    with contextlib.ExitStack() as stack:
        if log:
            try:
                log_f = stack.enter_context(open('cli.log', 'a', encoding='utf-8'))
            except OSError as e:
                click.secho(f"Error opening log file: {e}", fg="red", err=True)
                raise click.Abort()
            # Redirect for the duration of the command only; the stack restores
            # the real streams and closes the file on exit.
            stack.enter_context(contextlib.redirect_stdout(log_f))
            stack.enter_context(contextlib.redirect_stderr(log_f))
            click.echo(f"--- Log entry: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---")
            click.echo(f"Arguments: pdf_paths={pdf_paths}, simple_progress={simple_progress}, final={final}, subject={subject}, tags={tags}, notes={notes}, log={log}")
            click.echo("---" * 20)

        # Validate before any work: click.Abort is a RuntimeError and would be
        # swallowed by the per-file ``except Exception`` below.
        if final and not subject:
            click.secho("Error: --subject is required when using --final.", fg="red", err=True)
            raise click.Abort()

        files_to_process = [p.strip() for p in pdf_paths.split(',')]
        for pdf_path_or_url in files_to_process:
            click.secho(f"--- Processing: {click.style(pdf_path_or_url, bold=True)} ---", fg="yellow")
            local_pdf_path, original_filename, is_temp = _get_local_pdf_path(pdf_path_or_url)
            if not local_pdf_path:
                continue  # the helper already reported the problem

            succeeded = False
            try:
                if final:
                    _store_final_pdf(local_pdf_path, original_filename, subject, tags, notes)
                    succeeded = True
                else:  # Standard page-extraction mode
                    succeeded = _extract_pages_to_session(local_pdf_path, original_filename, simple_progress)
            except Exception as e:
                click.secho(f"An unexpected error occurred while processing {original_filename}: {e}", fg="red", err=True)
            finally:
                if is_temp and os.path.exists(local_pdf_path):
                    os.remove(local_pdf_path)

            if succeeded:
                click.secho(f"\n✅ All done! Upload complete for '{original_filename}'.", fg="green", bold=True)
# Dispatch to the click command group only when run as a script, not on import.
if __name__ == "__main__":
    cli()