|
|
from flask import Blueprint, request, jsonify, render_template
|
|
|
import os
|
|
|
import re
|
|
|
import fitz
|
|
|
from PIL import Image
|
|
|
from werkzeug.utils import secure_filename
|
|
|
# Pix2Text is an optional dependency: record whether it could be imported so
# the rest of the module can degrade gracefully instead of failing at import.
try:
    from pix2text import Pix2Text
except ImportError:
    PIX2TEXT_AVAILABLE = False
    print("⚠️ Pix2Text not available. Install with: pip install pix2text")
else:
    PIX2TEXT_AVAILABLE = True
|
|
|
|
|
|
# Blueprint that carries every PDFfly route defined in this module.
pdffly_bp = Blueprint('pdffly', __name__)

# Uploaded PDFs and the page images rendered from them are stored here.
UPLOAD_FOLDER = 'static/uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Shared Pix2Text recogniser; stays None when the package is missing or the
# model fails to load, and the routes check for that before using it.
p2t = None
if PIX2TEXT_AVAILABLE:
    print("🔹 Loading Pix2Text model for PDF → LaTeX...")
    try:
        p2t = Pix2Text()
        print("✅ Pix2Text model loaded successfully")
    except Exception as load_error:
        print(f"⚠️ Error loading Pix2Text: {load_error}")
        p2t = None
|
|
|
|
|
|
def pdf_to_images(pdf_path):
    """Render every page of a PDF to a PNG file inside ``UPLOAD_FOLDER``.

    Args:
        pdf_path: Path of the PDF file to rasterise.

    Returns:
        A list of image paths, one per page in page order. Files are named
        ``page_<n>.png`` with ``n`` starting at 1.

    NOTE(review): the image names do not include the source PDF, so a later
    conversion overwrites the pages of an earlier one. Confirm whether any
    consumer depends on these exact names before making them unique.
    """
    image_paths = []
    # The context manager closes the document even when rendering or saving
    # a page raises, which the manual open()/close() pair did not.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=200)
            img_path = os.path.join(UPLOAD_FOLDER, f"page_{page_number}.png")
            pix.save(img_path)
            image_paths.append(img_path)
    return image_paths
|
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Extract the raw text of every page of a PDF (fallback method).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string in which each page contributes a ``Page <n>:`` header
        (1-based) followed by that page's text; pages are separated by a
        blank line.
    """
    # The context manager guarantees the document is closed even if text
    # extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        all_text = [
            f"Page {page_number}:\n{page.get_text()}\n"
            for page_number, page in enumerate(doc, start=1)
        ]
    return "\n".join(all_text)
|
|
|
|
|
|
# Operator names that LaTeX/amsmath already define as macros of their own
# (written with a leading backslash: sin, lim, ...). Anything else must stay
# wrapped in an operatorname group or it would become an undefined command.
_NATIVE_OPERATORS = frozenset({
    'arccos', 'arcsin', 'arctan', 'arg', 'cos', 'cosh', 'cot', 'coth',
    'csc', 'deg', 'det', 'dim', 'exp', 'gcd', 'hom', 'inf', 'ker', 'lg',
    'lim', 'liminf', 'limsup', 'ln', 'log', 'max', 'min', 'sec', 'sin',
    'sinh', 'sup', 'tan', 'tanh',
})


def _rewrite_operatorname(match):
    """Build the cleaned replacement for one matched operatorname group."""
    star, raw_name = match.group(1), match.group(2)
    # OCR tends to space the letters out ("s i n"); glue them back together.
    name = ''.join(raw_name.split())
    if name.lower() in _NATIVE_OPERATORS:
        macro = '\\' + name.lower()
        # A macro directly followed by a letter would be read as one longer,
        # undefined control word, so separate the two with a space.
        following = match.string[match.end():match.end() + 1]
        return macro + ' ' if following.isalpha() else macro
    # Unknown operator: keep the wrapper (and its star, which controls where
    # limits are placed) and only tidy up the name.
    return f'\\operatorname{star}{{{name}}}'


def _balance_delimiters(text, opener, closer):
    """Pad text so unescaped opener/closer characters occur equally often."""
    # Remove every backslash-escaped character before counting so literal
    # braces and display-math delimiters are not mistaken for grouping.
    unescaped = re.sub(r'\\.', '', text, flags=re.DOTALL)
    surplus = unescaped.count(opener) - unescaped.count(closer)
    if surplus > 0:
        return text + closer * surplus
    if surplus < 0:
        return opener * -surplus + text
    return text


def clean_latex_code(latex_str):
    """Clean and format LaTeX code for Overleaf compilation.

    Steps, in order:
      1. Normalise operatorname groups: spaced-out letters are joined, names
         LaTeX defines natively become their macro, others stay wrapped.
      2. Convert ``$$...$$`` display math to bracket display-math delimiters.
      3. Strip control characters (C0 except tab/newline/CR, DEL, C1) while
         keeping printable non-ASCII text such as accented letters.
      4. Collapse runs of two or more backslashes to exactly two.
      5. Balance unescaped braces and square brackets by padding the string.
      6. Collapse all whitespace runs to a single space and trim the ends.

    Args:
        latex_str: Raw LaTeX produced by OCR.

    Returns:
        The cleaned string, or ``""`` when the input is empty or not a str.
    """
    if not latex_str or not isinstance(latex_str, str):
        return ""

    latex_str = re.sub(r'\\operatorname(\*?)\s*\{([^}]+)\}',
                       _rewrite_operatorname, latex_str)

    latex_str = re.sub(r'\$\$([^$]+)\$\$', r'\\[\1\\]', latex_str)

    latex_str = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]+', '', latex_str)

    # Collapse backslash runs before balancing: whether a brace is escaped
    # depends on how many backslashes end up in front of it.
    latex_str = re.sub(r'\\\\+', r'\\\\', latex_str)

    latex_str = _balance_delimiters(latex_str, '{', '}')
    latex_str = _balance_delimiters(latex_str, '[', ']')

    latex_str = re.sub(r'\s+', ' ', latex_str)

    return latex_str.strip()
|
|
|
|
|
|
# Characters that are special in LaTeX text mode, mapped to their escaped
# form. str.translate applies the table in a single pass, so the backslashes
# introduced by one replacement are never escaped a second time.
_LATEX_TEXT_ESCAPES = str.maketrans({
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
})


def create_complete_latex_document(latex_content, title="PDF to LaTeX Conversion"):
    """Wrap LaTeX content in a complete compilable document.

    Args:
        latex_content: LaTeX body, inserted verbatim under an unnumbered
            "Content" section.
        title: Plain-text document title. LaTeX special characters in it are
            escaped, so titles such as file names containing ``_`` or ``#``
            no longer break compilation.

    Returns:
        The full document source, from the documentclass line through the
        closing end-of-document line.
    """
    safe_title = title.translate(_LATEX_TEXT_ESCAPES)

    document = r'''\documentclass{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{graphicx}

\title{''' + safe_title + r'''}
\author{PDFly}
\date{\today}

\begin{document}

\maketitle

\begin{center}
\textit{This document was automatically generated from a PDF file using OCR and LaTeX conversion.}
\end{center}

\section*{Content}

''' + latex_content + r'''

\end{document}'''

    return document
|
|
|
|
|
|
@pdffly_bp.route("/", methods=["GET"])
def pdffly_page():
    """Serve the PDFfly landing page by rendering the ``pdffly.html`` template."""
    page_html = render_template("pdffly.html")
    return page_html
|
|
|
|
|
|
@pdffly_bp.route('/upload', methods=['POST'])
def upload_and_convert_pdf():
    """Upload a PDF and convert each page to LaTeX.

    Expects a multipart form upload with the PDF in the ``file`` field.

    Returns:
        On success, a JSON object with ``success``, ``message``, ``pdf_path``,
        ``filename``, ``pages``, ``results`` (one entry per page holding
        ``page``, ``image`` and either ``latex`` or ``error``) and
        ``complete_document`` (a full LaTeX document).
        400 with an ``error`` message when the upload is missing, unnamed or
        not a ``.pdf`` file; 500 with ``success: False`` when saving or
        processing the PDF fails.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file found'}), 400

    file = request.files['file']
    if not file or file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    if not file.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'Only PDF files are allowed'}), 400

    filename = secure_filename(file.filename)
    pdf_path = os.path.join(UPLOAD_FOLDER, filename)

    try:
        # Saving happens inside the try block so that disk errors are
        # reported through the same JSON error response as processing errors.
        file.save(pdf_path)

        # pdf_to_images yields exactly one image per page, so the page count
        # comes from its result instead of opening the PDF a second time.
        images = pdf_to_images(pdf_path)
        page_count = len(images)

        latex_results = []
        all_latex_pages = []
        for page_number, img_path in enumerate(images, start=1):
            image_url = img_path.replace('static/', '/static/')
            try:
                if p2t:
                    result = p2t.recognize(img_path, resized_shape=768)
                    latex_code = result if isinstance(result, str) else str(result)
                    latex_code = clean_latex_code(latex_code)
                    all_latex_pages.append(f"% Page {page_number}\n{latex_code}")
                else:
                    latex_code = "Text extraction (Pix2Text not available)"
                    all_latex_pages.append(latex_code)

                latex_results.append({
                    'page': page_number,
                    'image': image_url,
                    'latex': latex_code
                })
            except Exception as e:
                # A failure on one page must not abort the whole conversion.
                latex_results.append({
                    'page': page_number,
                    'image': image_url,
                    'error': str(e)
                })
                # Flatten the message to one line: a LaTeX comment ends at the
                # first newline, so a multi-line message would leak text into
                # the document body.
                one_line_error = ' '.join(str(e).split())
                all_latex_pages.append(f"% Page {page_number}: Error - {one_line_error}")

        combined_latex = "\n\n".join(all_latex_pages)
        complete_document = create_complete_latex_document(combined_latex, filename)

        return jsonify({
            'success': True,
            'message': 'PDF converted successfully!',
            'pdf_path': pdf_path.replace('static/', '/static/'),
            'filename': filename,
            'pages': page_count,
            'results': latex_results,
            'complete_document': complete_document
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': f'Error processing PDF: {str(e)}'
        }), 500
|
|
|
|
|
|
@pdffly_bp.route('/process', methods=['POST'])
def process_pdf():
    """Extract the text of one page, or of the whole PDF, as a LaTeX text block.

    Expects a JSON object with:
        filename: Name of a previously uploaded PDF (required).
        convert_all: When truthy, extract every page; defaults to False.
        page: Zero-based page index used when ``convert_all`` is falsy;
            defaults to 0.

    NOTE(review): the original handler also read a ``coordinates`` field but
    never used it, so area selection is not implemented; the field is ignored.

    Returns:
        JSON with ``success`` and ``latex`` on success. 400 for a malformed
        body, missing/invalid filename or non-integer page; 404 when the PDF
        does not exist; 500 when reading the PDF fails.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'success': False, 'error': 'Invalid JSON body'}), 400

    filename = data.get('filename')
    convert_all = data.get('convert_all', False)
    page_num = data.get('page', 0)

    if not filename:
        return jsonify({'success': False, 'error': 'No filename provided'}), 400

    # The name comes from the client: sanitise it so that values such as
    # "../../secret.pdf" cannot escape the upload folder. Names produced by
    # the upload route are already sanitised and pass through unchanged.
    safe_name = secure_filename(str(filename))
    if not safe_name:
        return jsonify({'success': False, 'error': 'Invalid filename'}), 400

    pdf_path = os.path.join(UPLOAD_FOLDER, safe_name)
    if not os.path.isfile(pdf_path):
        return jsonify({'success': False, 'error': 'PDF file not found'}), 404

    # bool is a subclass of int, so it is rejected explicitly.
    page_is_int = isinstance(page_num, int) and not isinstance(page_num, bool)
    if not convert_all and not page_is_int:
        return jsonify({'success': False, 'error': 'Invalid page number'}), 400

    try:
        if convert_all:
            text = extract_text_from_pdf(pdf_path)
            latex = f"\\text{{{text}}}"
        else:
            # The context manager closes the document even if extraction fails.
            with fitz.open(pdf_path) as doc:
                # Negative indexes are treated as out of range rather than
                # silently selecting a page counted from the end.
                if 0 <= page_num < len(doc):
                    text = doc[page_num].get_text()
                    latex = f"\\text{{{text}}}"
                else:
                    latex = "Page not found"

        return jsonify({
            'success': True,
            'latex': latex
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
|
|
|
|
|
|
@pdffly_bp.route('/solve', methods=['POST'])
def solve_latex():
    """Placeholder endpoint for solving mathematical content.

    Reads an optional ``latex`` field from the JSON request body. No solver
    is wired in yet, so the response is always the same informational
    payload regardless of the input.

    Returns:
        JSON with ``success: True`` and a ``solution`` object of type
        ``info`` stating that the solver integration is pending.
    """
    data = request.get_json()
    # A body that is valid JSON but not an object (e.g. null or a list) has
    # no .get(); treat it as "no expression supplied" instead of crashing.
    _latex = data.get('latex', '') if isinstance(data, dict) else ''

    return jsonify({
        'success': True,
        'solution': {
            'type': 'info',
            'message': 'Math solver integration pending'
        }
    })
|
|
|
|