Spaces:

prithivMLmods
/

Qwen-Image-LoRA-DLC

Running on Zero

App Files Files Community

Qwen-Image-LoRA-DLC / app.py

prithivMLmods

Update app.py

ac626f9 verified 2 months ago

raw

history blame

11.9 kB

	import os
	import gradio_pdf
	import hashlib
	import spaces
	import re
	import time
	import click
	import gradio as gr
	from io import BytesIO
	from PIL import Image
	from gradio_pdf import PDF
	from loguru import logger
	import sys # Added for logging configuration
	import base64 # Added for image encoding
	from bs4 import BeautifulSoup # Added for HTML manipulation
	from datetime import datetime
	from pathlib import Path
	import torch
	from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
	from transformers.image_utils import load_image
	import fitz # PyMuPDF library for PDF processing
	import html2text
	import markdown
	import tempfile

	# Define supported file suffixes
	# Lower-case extensions (leading dot included) that the rest of the file
	# compares uploaded file names against.
	pdf_suffixes = [".pdf"]
	image_suffixes = [".png", ".jpeg", ".jpg"]

	# --- Model and Processor Initialization ---
	# Everything below runs once, at import time, so importing this module
	# loads the processor and the model weights.
	# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	MODEL_ID = "Logics-MLLM/Logics-Parsing"
	# NOTE(review): trust_remote_code=True presumably lets the model repository
	# supply its own Python code -- confirm the repository is trusted.
	processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
	model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	    MODEL_ID,
	    trust_remote_code=True,
	    # float16 on GPU, float32 on CPU.
	    torch_dtype=torch.float16 if device == "cuda" else torch.float32
	).to(device).eval()

	@spaces.GPU
	def parse_page(image: Image.Image) -> str:
	    """
	    Ask the loaded vision-language model to transcribe one page image as HTML.

	    A single user turn (the image plus a fixed text instruction) is rendered
	    through the processor's chat template, fed to ``model.generate`` with
	    sampling enabled, and the newly generated tokens are decoded.

	    Args:
	        image: The rendered page to parse.

	    Returns:
	        The decoded model output for the page (prompt tokens removed).
	    """
	    instruction = "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."
	    conversation = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image", "image": image},
	                {"type": "text", "text": instruction},
	            ],
	        },
	    ]

	    chat_prompt = processor.apply_chat_template(
	        conversation, tokenize=False, add_generation_prompt=True
	    )
	    model_inputs = processor(
	        text=[chat_prompt],
	        images=[image],
	        return_tensors="pt",
	        padding=True,
	    ).to(device)

	    generation_settings = dict(
	        max_new_tokens=2048,
	        temperature=0.1,
	        top_p=0.9,
	        do_sample=True,
	        repetition_penalty=1.05,
	    )
	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        sequences = model.generate(**model_inputs, **generation_settings)

	    # Each generated sequence starts with its prompt; keep only what follows it.
	    completions = []
	    for prompt_ids, full_ids in zip(model_inputs.input_ids, sequences):
	        completions.append(full_ids[len(prompt_ids):])

	    decoded = processor.batch_decode(
	        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
	    )
	    # Only one prompt was submitted, so the first entry is the page result.
	    return decoded[0]


	def images_bytes_to_pdf_bytes(image_bytes: bytes) -> bytes:
	    """
	    Re-encode an in-memory image as a PDF document.

	    Args:
	        image_bytes: Raw bytes of an image file that PIL can open.

	    Returns:
	        The bytes of a PDF produced by saving the RGB-converted image.
	    """
	    rgb_image = Image.open(BytesIO(image_bytes)).convert("RGB")
	    with BytesIO() as sink:
	        rgb_image.save(sink, format="PDF", save_all=True)
	        # Copy the contents out before the buffer is closed.
	        return sink.getvalue()


	def read_fn(path: "str | Path") -> bytes:
	    """
	    Read a supported input file and return it as PDF bytes.

	    The extension is matched case-insensitively against ``image_suffixes`` and
	    ``pdf_suffixes`` (so ``scan.JPG`` and ``DOC.PDF`` are accepted), and it is
	    validated before the file is read so unsupported files are never loaded.

	    Args:
	        path: Location of the file, as a string or ``Path``.

	    Returns:
	        The file's bytes unchanged for a PDF, or the bytes of a PDF built from
	        the image for an image file.

	    Raises:
	        ValueError: If the extension is neither a supported image nor a PDF.
	        OSError: If the file cannot be opened or read.
	    """
	    path = Path(path)
	    # The suffix lists are lower-case, so normalise before comparing.
	    suffix = path.suffix.lower()

	    if suffix not in image_suffixes and suffix not in pdf_suffixes:
	        raise ValueError(f"Unknown file suffix: {path.suffix}")

	    file_bytes = path.read_bytes()
	    if suffix in image_suffixes:
	        return images_bytes_to_pdf_bytes(file_bytes)
	    return file_bytes


	def safe_stem(file_path: str) -> str:
	    """
	    Derive a filesystem-friendly base name from a path.

	    The final path component is taken without its last extension, and every
	    character that is not a word character or a dot is replaced by ``_``.
	    """
	    unsafe_chars = re.compile(r'[^\w.]')
	    return unsafe_chars.sub('_', Path(file_path).stem)


	def to_pdf(file_path: str) -> "str | None":
	    """
	    Ensure the input file is available as a PDF on disk and return its path.

	    The file is read through ``read_fn`` (which converts images to PDF bytes)
	    and stored in the system temporary directory. The file name combines the
	    sanitised stem with a digest of the PDF content, so two different uploads
	    that share a name no longer overwrite each other, while re-submitting the
	    same content reuses the same file instead of piling up copies.

	    Args:
	        file_path: Path of the uploaded PDF or image, or ``None``.

	    Returns:
	        Path of the temporary PDF, or ``None`` when ``file_path`` is ``None``.

	    Raises:
	        Whatever ``read_fn`` raises for unreadable or unsupported files, and
	        ``OSError`` if the temporary file cannot be written.
	    """
	    if file_path is None:
	        return None

	    pdf_bytes = read_fn(file_path)

	    # Content-addressed name: identical bytes map to the same file.
	    digest = hashlib.sha256(pdf_bytes).hexdigest()[:16]
	    tmp_dir = tempfile.gettempdir()  # system temp dir
	    tmp_file_path = os.path.join(tmp_dir, f'{safe_stem(file_path)}_{digest}.pdf')

	    if not os.path.exists(tmp_file_path):
	        # Write to a private staging file first, then rename atomically, so a
	        # concurrent reader can never observe a half-written PDF.
	        with tempfile.NamedTemporaryFile(
	            dir=tmp_dir, suffix='.pdf.part', delete=False
	        ) as staging_file:
	            staging_file.write(pdf_bytes)
	        os.replace(staging_file.name, tmp_file_path)

	    return tmp_file_path


	def _extract_page_images(pdf_document, page) -> list:
	    """Return the images embedded in *page* as base64 ``data:`` URIs."""
	    data_uris = []
	    for img in page.get_images(full=True):
	        xref = img[0]  # the first field of each entry is used as the xref
	        base_image = pdf_document.extract_image(xref)
	        encoded = base64.b64encode(base_image["image"]).decode()
	        data_uris.append(f"data:image/{base_image['ext']};base64,{encoded}")
	    return data_uris


	def _render_page(page, dpi: int = 200):
	    """Rasterise *page* at *dpi* and return it as a PIL image."""
	    zoom = dpi / 72.0  # scale factor relative to the 72 used as the base
	    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
	    return Image.open(BytesIO(pix.tobytes("png")))


	def _inject_images(html_content: str, page_images_base64: list) -> str:
	    """Insert the extracted images into the model's HTML for one page."""
	    soup = BeautifulSoup(html_content, 'html.parser')
	    figures = soup.find_all('figure')

	    if len(figures) == len(page_images_base64):
	        # One image per <figure>: pair them up in document order.
	        for fig, b64_img in zip(figures, page_images_base64):
	            fig.append(soup.new_tag('img', src=b64_img, style="max-width:100%; height:auto;"))
	    else:
	        # Counts disagree, so no reliable pairing exists: append every image
	        # after the page content instead.
	        logger.warning(f" > Mismatch: Model found {len(figures)} figures, but {len(page_images_base64)} images were extracted. Appending images to the end.")
	        for b64_img in page_images_base64:
	            p_tag = soup.new_tag('p')
	            p_tag.append(soup.new_tag('img', src=b64_img, style="max-width:100%; height:auto;"))
	            soup.append(p_tag)
	    return str(soup)


	async def pdf_parse(file_path: str, request: gr.Request):
	    """
	    Parse an uploaded PDF or image into Markdown and HTML.

	    Each page is rendered to an image and parsed by ``parse_page``; images
	    embedded in the page are extracted separately and injected into the
	    generated HTML. The combined HTML is then converted to Markdown.

	    Args:
	        file_path: Path of the uploaded file, or ``None`` if nothing was uploaded.
	        request: The Gradio request object (not used by this function).

	    Returns:
	        A 6-tuple ``(rendered_markdown_html, markdown_source, generated_html,
	        markdown_file_path, pdf_path, status_text)``. On any failure the first
	        three items carry a message, the two paths are ``None`` and
	        ``status_text`` starts with ``"Error:"``; no exception is raised.
	    """
	    # NOTE(review): this coroutine calls blocking work (page rendering, model
	    # inference) directly -- confirm how the hosting framework schedules it.
	    if file_path is None:
	        logger.warning("file_path is None")
	        return (
	            "<p>Please upload a PDF file</p>", "", "<p>No input file</p>",
	            None, None, "Error: No file provided"
	        )
	    logger.info(f'Processing file: {file_path}')

	    try:
	        # Conversion happens inside the try block so that an unsupported or
	        # unreadable upload is reported through the error tuple as well.
	        tmp_pdf_path = to_pdf(file_path)
	        if tmp_pdf_path is None:
	            return (
	                "<p>Failed to process file</p>", "", "<p>Processing error</p>",
	                None, None, "Error: Failed to process file"
	            )

	        start_time = time.time()
	        html_parts = []

	        # The context manager closes the document even when a page fails.
	        with fitz.open(tmp_pdf_path) as pdf_document:
	            page_count = len(pdf_document)
	            for page_num in range(page_count):
	                page = pdf_document.load_page(page_num)
	                logger.info(f"Processing Page {page_num + 1}/{page_count}")

	                # --- 1. Extract images directly from the PDF page ---
	                page_images_base64 = _extract_page_images(pdf_document, page)
	                logger.info(f" > Found {len(page_images_base64)} images on page {page_num + 1}.")

	                # --- 2. Render the page to an image for the VL-Model ---
	                page_image = _render_page(page)

	                # --- 3. Get the structured HTML from the model ---
	                logger.info(" > Parsing page layout with Logics-Parsing model...")
	                html_content = parse_page(page_image)

	                # --- 4. Inject extracted images back into the HTML ---
	                if page_images_base64:
	                    logger.info(f" > Injecting {len(page_images_base64)} extracted images into generated HTML...")
	                    html_content = _inject_images(html_content, page_images_base64)

	                html_parts.append(f'<div class="page-{page_num+1}">{html_content}</div>')

	        full_html = '\n'.join(html_parts)
	        parsing_time = time.time() - start_time

	        # Convert final rich HTML to Markdown, then render that Markdown
	        # back to HTML for the preview tab.
	        mmd = html2text.html2text(full_html)
	        mmd_html = markdown.markdown(mmd)
	        qwen_html = full_html

	        # Create a temporary markdown file for download; delete=False keeps
	        # it on disk after the handle is closed.
	        with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
	            f.write(mmd)
	            md_path = f.name

	        cost_time = f'Parsing time: {parsing_time:.2f}s, Total time: {parsing_time:.2f}s'

	        return mmd_html, mmd, qwen_html, md_path, tmp_pdf_path, cost_time

	    except Exception as e:
	        # UI boundary: log with traceback through the configured logger and
	        # report the failure to the user instead of raising.
	        logger.exception(f"Parsing failed: {e}")
	        return (
	            "<p>Parsing failed. Please try again.</p>", "", f"<p>Error: {str(e)}</p>",
	            None, None, f"Error: {str(e)}"
	        )


	@click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
	@click.pass_context
	def main(ctx, **kwargs):
	    """
	    Build the Gradio interface, wire up its events and launch it.

	    ``ctx`` and ``kwargs`` are accepted from click but not used here.
	    """
	    # Replace loguru's default handler with an INFO-level stdout sink.
	    logger.remove()
	    logger.add(sys.stdout, level="INFO")

	    accepted_suffixes = pdf_suffixes + image_suffixes

	    with gr.Blocks(theme=gr.themes.Soft()) as demo:
	        gr.Markdown("# 📄 Logics-Parsing Document Analysis")
	        gr.Markdown("Upload a PDF or image file to parse its content into structured Markdown and HTML formats, now with improved image extraction.")

	        with gr.Row():
	            # Left panel: upload, action buttons, preview and examples.
	            with gr.Column(variant='panel', scale=5):
	                with gr.Row():
	                    input_file = gr.File(
	                        label='Please upload a PDF or image (Max 20 pages for conversion)',
	                        file_types=accepted_suffixes,
	                    )
	                with gr.Row():
	                    convert_button = gr.Button('Convert', variant='primary')
	                    clear_button = gr.ClearButton(value='Clear')
	                pdf_show = PDF(label='PDF Preview', interactive=False, visible=True, height=800)

	                # Offer bundled example files when the examples folder exists.
	                example_root = 'parsing/examples'
	                logger.info(f'Looking for examples in: {example_root}')
	                if os.path.isdir(example_root):
	                    example_files = []
	                    for entry in os.listdir(example_root):
	                        if entry.endswith(tuple(accepted_suffixes)):
	                            example_files.append(os.path.join(example_root, entry))
	                    if example_files:
	                        with gr.Accordion('Examples:', open=True):
	                            gr.Examples(examples=example_files, inputs=input_file)

	            # Right panel: download link, timing and the three result views.
	            with gr.Column(variant='panel', scale=5):
	                output_file = gr.File(label='Download Markdown Result', interactive=False)
	                cost_time = gr.Text(label='Time Cost', interactive=False)
	                with gr.Tabs():
	                    with gr.Tab('Markdown Rendering'):
	                        mmd_html = gr.HTML(label='MMD Rendering')
	                    with gr.Tab('Markdown Source'):
	                        mmd = gr.TextArea(lines=45, show_copy_button=True, label="Markdown Source")
	                    with gr.Tab('Generated HTML'):
	                        raw_html = gr.TextArea(lines=45, show_copy_button=True, label="Generated HTML")

	        # The Clear button resets every input and output component.
	        clear_button.add([input_file, pdf_show, mmd, raw_html, output_file, mmd_html, cost_time])

	        # Show a PDF preview as soon as a file is chosen.
	        input_file.change(fn=to_pdf, inputs=input_file, outputs=pdf_show, show_progress="full")

	        # Output order must match the tuple returned by pdf_parse.
	        convert_button.click(
	            fn=pdf_parse,
	            inputs=[input_file],
	            outputs=[mmd_html, mmd, raw_html, output_file, pdf_show, cost_time],
	            concurrency_limit=15,
	            show_progress="full",
	        )

	    demo.launch(debug=True)


	# Launch the app only when this file is executed as a script.
	if __name__ == '__main__':
	    main()