Spaces:
Build error
Build error
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import os | |
| import re | |
| from datetime import datetime | |
# PyMuPDF span "flags" bit 4 (value 16) marks a bold font.
_BOLD_FLAG = 2 ** 4


def _span_to_markdown(span):
    """Return the Markdown fragment for one PyMuPDF text span.

    Font size drives a heading heuristic (>= 20 -> ``#``, >= 16 -> ``##``,
    >= 14 -> ``###``); smaller text is emitted inline, wrapped in ``**`` when
    the span's bold flag is set.
    """
    text = span["text"]

    # Whitespace-only spans would otherwise become empty headings or a
    # dangling "**  **"; pass them through as plain spacing instead.
    if not text.strip():
        return f"{text} "

    font_size = span["size"]
    if font_size >= 20:
        return f"\n# {text}\n"
    if font_size >= 16:
        return f"\n## {text}\n"
    if font_size >= 14:
        return f"\n### {text}\n"

    if span["flags"] & _BOLD_FLAG:
        # Emphasis markers must hug the text ("** x **" is not rendered as
        # bold), so strip the padding inside the markers.
        return f"**{text.strip()}** "
    return f"{text} "


def _tidy_markdown(text):
    """Apply the list, trailing-space and blank-line clean-ups to *text*."""
    # Put every "N. " marker on its own line (simple numbered-list heuristic).
    text = re.sub(r'(\d+)\. ', r'\n\1. ', text)
    # Drop spaces that sit directly before a newline.
    text = re.sub(r' +(\n)', r'\1', text)
    # Collapse blank-line runs last, so newlines introduced by the two steps
    # above are normalised as well.
    return re.sub(r'\n{3,}', '\n\n', text)


def pdf_to_kindle_md(pdf_file, image_quality):
    """Convert a PDF into Markdown and extract its embedded images.

    A directory named ``output_<YYYYmmdd_HHMMSS>`` is created in the current
    working directory. It receives ``converted.md`` and one
    ``image_<n>.<ext>`` file per image found on the pages.

    Args:
        pdf_file: Either a path string or an object whose ``.name`` attribute
            holds the path of the PDF (as upload widgets typically provide).
        image_quality: Accepted for interface compatibility but currently
            unused; image bytes are written exactly as stored in the PDF.

    Returns:
        A ``(markdown_text, output_dir)`` tuple.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Plain strings have no ``.name``; fall back to the value itself.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Create output directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"output_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)

    md_content = []
    image_count = 1

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Extract text with formatting
            for block in page.get_text("dict")["blocks"]:
                # Image blocks carry no "lines" key, hence the default.
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        md_content.append(_span_to_markdown(span))
                    md_content.append("\n")

            # Extract images
            for img in page.get_images():
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_name = f"image_{image_count}.{base_image['ext']}"
                image_path = os.path.join(output_dir, image_name)
                with open(image_path, "wb") as image_file:
                    image_file.write(base_image["image"])
                # Reference the image relative to converted.md, which is
                # saved in the same directory.
                md_content.append(f"\n![Image {image_count}]({image_name})\n\n")
                image_count += 1

            md_content.append("\n\n")

    # Post-processing
    final_md = _tidy_markdown("".join(md_content))

    # Save markdown file
    md_path = os.path.join(output_dir, "converted.md")
    with open(md_path, "w", encoding="utf-8") as md_file:
        md_file.write(final_md)

    return final_md, output_dir
def create_interface():
    """Build and return the Gradio Blocks app for the PDF converter.

    The layout has an upload column (PDF file, image-quality slider, convert
    button) and a result column (Markdown preview, downloadable files).

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    example_pdf = "sample_report.pdf"

    def run_conversion(pdf_file, quality):
        """Run the converter and expand its output directory into file paths."""
        markdown, output_dir = pdf_to_kindle_md(pdf_file, quality)
        # A File component expects file paths, not a directory, so hand it
        # every file the conversion produced.
        candidates = (
            os.path.join(output_dir, name)
            for name in sorted(os.listdir(output_dir))
        )
        files = [path for path in candidates if os.path.isfile(path)]
        return markdown, files

    with gr.Blocks() as app:
        gr.Markdown("# PDF to Kindle-Friendly Markdown Converter")
        with gr.Row():
            with gr.Column():
                # No explicit ``type``: the accepted values differ between
                # Gradio major versions, so rely on the version's default.
                pdf_input = gr.File(label="Upload PDF File")
                image_quality = gr.Slider(1, 10, value=7, label="Image Quality (1-10)")
                convert_btn = gr.Button("Convert to Markdown")
            with gr.Column():
                md_output = gr.Textbox(label="Markdown Preview", interactive=False, lines=20)
                download_group = gr.File(label="Download Files", file_count="multiple")

        convert_btn.click(
            fn=run_conversion,
            inputs=[pdf_input, image_quality],
            outputs=[md_output, download_group],
        )

        # Only offer the example when the sample file actually ships with the
        # app; a missing example file should not prevent the UI from building.
        if os.path.exists(example_pdf):
            gr.Examples(
                examples=[[example_pdf, 7]],
                inputs=[pdf_input, image_quality],
            )
    return app
if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server.
    app = create_interface()
    app.launch()