Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| from pdf2image import convert_from_path | |
| from PIL import Image, ImageDraw, ImageFont | |
| import json | |
| import tempfile | |
# Colors for annotations, as (R, G, B) tuples passed to PIL drawing calls.
PARA_COLOR = (255, 0, 0) # Red: outline and label of a non-highlighted paragraph box
DATE_COLOR = (0, 255, 0) # Green: outline and label of a date box
HOVER_COLOR = (255, 165, 0) # Orange: outline and label of the highlighted paragraph box
# Directory passed to pdf2image as `poppler_path`.
# NOTE(review): the name is spelled "POPLER" (one "P"); it is referenced by this
# exact name elsewhere in the file, so it is kept as is.
POPLER_PATH = "/usr/bin"
def _load_label_font(size=32):
    """Return the best available font for box labels at *size* points."""
    candidates = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "arial.ttf",
    )
    for font_path in candidates:
        try:
            return ImageFont.truetype(font_path, size)
        except (OSError, ImportError):
            # Font file missing/unreadable, or FreeType support unavailable.
            continue
    try:
        # Newer Pillow releases accept a size for the built-in font.
        return ImageFont.load_default(size=size)
    except (TypeError, OSError, ImportError):
        # Older Pillow: `size` is not accepted; use the fixed-size bitmap font.
        return ImageFont.load_default()


def _int_bbox(item):
    """Return the item's 'bbox' as integer corners ordered (left, top, right, bottom).

    A missing or null 'bbox' yields the zero box. Corners are reordered so that
    Pillow does not reject a box whose second corner precedes the first.
    """
    x1, y1, x2, y2 = map(int, item.get('bbox') or [0, 0, 0, 0])
    return min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)


def annotate_image(page_img, paragraphs, highlight_id=None):
    """Draw paragraph and date boxes, with their IDs, onto *page_img*.

    Box coordinates are used as-is (pixel coordinates of *page_img*); no
    scaling is applied, so they must already match the rendered resolution.

    Args:
        page_img: PIL image to draw on. It is modified in place.
        paragraphs: iterable of dicts with optional keys 'bbox'
            (four numbers), 'paragraph_id', and 'dates' (list of dicts with
            optional 'bbox' and 'date_id').
        highlight_id: paragraph ID to emphasise; compared as strings.

    Returns:
        The same *page_img* object, after drawing.
    """
    draw = ImageDraw.Draw(page_img)
    font = _load_label_font(32)

    for para in paragraphs:
        x1, y1, x2, y2 = _int_bbox(para)
        para_id = para.get('paragraph_id', '')

        # The highlighted paragraph gets a distinct color and a thicker outline.
        if str(para_id) == str(highlight_id):
            color = HOVER_COLOR
            width = 3
        else:
            color = PARA_COLOR
            width = 2

        draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
        # Label uses the same color as the box so the two read as one unit.
        draw.text((x1 + 5, y1 + 5), str(para_id), fill=color, font=font)

        for date in para.get('dates') or []:
            dx1, dy1, dx2, dy2 = _int_bbox(date)
            date_id = date.get('date_id', '')
            draw.rectangle([dx1, dy1, dx2, dy2], outline=DATE_COLOR, width=2)
            # The date ID is drawn exactly as given, in the date box color.
            draw.text((dx1 + 5, dy1 + 5), str(date_id), fill=DATE_COLOR, font=font)

    return page_img
def _format_confidence(confidence):
    """Return *confidence* as a one-decimal percentage, or "N/A" if not numeric."""
    try:
        return f"{float(confidence):.1%}"
    except (TypeError, ValueError):
        return "N/A"


def _render_side_panel(page_num, paragraphs, highlight_id):
    """Build the scrollable side-panel HTML listing paragraphs and their dates.

    All values taken from the annotation data are HTML-escaped, because the
    JSON is user-uploaded and must not be able to inject markup.
    """
    import html

    parts = [
        f"""
<div style="
    height: 600px;
    overflow-y: auto;
    padding-right: 10px;
    font-family: Arial, sans-serif;
">
<h3 style='color:black; margin-top: 0;'>Page {page_num}</h3>
"""
    ]

    for para in paragraphs:
        para_id = para.get("paragraph_id", "")
        llm_text = para.get("llm_text", "No text")

        # The highlighted paragraph gets a warmer background and thicker border.
        if str(para_id) == str(highlight_id):
            para_style = "background-color: #fff3cd; padding: 10px; margin: 5px; border-left: 4px solid #ffc107;"
        else:
            para_style = "background-color: #f8f9fa; padding: 10px; margin: 5px; border-left: 2px solid #ddd;"

        parts.append(
            f"""
<div style="{para_style}">
<div style="font-weight: bold; color: black;">Paragraph {html.escape(str(para_id))}</div>
<div style="margin: 8px 0; color: black;">{html.escape(str(llm_text))}</div>
"""
        )

        dates = para.get("dates") or []
        if dates:
            parts.append(
                """
<div style="background-color: #e8f5e9; padding: 8px; margin-top: 8px; border-radius: 4px;">
<div style="font-weight: bold; margin-bottom: 5px; color: black;">Dates:</div>
<div style="display: flex; flex-wrap: wrap; gap: 5px;">
"""
            )
            for date in dates:
                date_id = html.escape(str(date.get("date_id", "")))
                transcription = html.escape(str(date.get("transcription", "N/A")))
                confidence = _format_confidence(date.get("confidence", 0))
                parts.append(
                    f"""
<div style="background-color: white; padding: 4px 8px; border: 1px solid #c8e6c9; border-radius: 3px;">
<span style="font-weight: bold; color: black;">Date {date_id}:</span>
<span style="margin-left: 5px; color: black;">{transcription}</span>
<span style="margin-left: 5px; font-size: 0.8em; color: #666;">({confidence})</span>
</div>
"""
                )
            # Close the flex row and the dates container.
            parts.append("</div></div>")

        parts.append("</div>")  # Close the paragraph card.

    parts.append("</div>")  # Close the scrollable container.
    return "".join(parts)


def process_documents(pdf_path, json_path, page_num=1, highlight_id=None):
    """Render one annotated PDF page and the matching side-panel HTML.

    The page is rasterised at 300 DPI; annotation boxes are drawn unscaled, so
    the JSON coordinates are expected to be in that resolution. Annotations are
    looked up under the key "<pdf file stem>_page_<page number>".

    Args:
        pdf_path: path of the PDF file.
        json_path: path of the JSON annotation file (read as UTF-8).
        page_num: 1-based page number; coerced to int.
        highlight_id: paragraph ID to emphasise in both image and text.

    Returns:
        A 4-tuple ``(image_path, html, page_num, marker)``. On success
        ``image_path`` is a PNG temp file (not deleted here, so the UI can
        serve it) and ``marker`` is "?". On any failure ``image_path`` is None,
        ``html`` holds an escaped error message and ``marker`` is "1".
    """
    import html

    try:
        # UI number fields may deliver floats; the JSON key needs an integer.
        page_index = int(page_num)

        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        pages = convert_from_path(
            pdf_path,
            dpi=300,
            first_page=page_index,
            last_page=page_index,
            poppler_path=POPLER_PATH,
        )
        if not pages:
            return None, "No content", 1, "1"
        page_img = pages[0]

        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        page_key = f"{pdf_name}_page_{page_index}"
        # Tolerate a null page entry or a null paragraph list.
        paragraphs = (data.get(page_key) or {}).get("paragraphs") or []

        annotated_img = annotate_image(page_img.copy(), paragraphs, highlight_id)

        # Reserve a path, then close the handle before writing to it so no
        # file descriptor is leaked.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            image_path = temp_file.name
        annotated_img.save(image_path)

        text_content = _render_side_panel(page_index, paragraphs, highlight_id)
        return image_path, text_content, page_index, "?"
    except Exception as e:
        # Boundary handler: the UI shows the message instead of crashing.
        return None, f"<div style='color: red;'>Error: {html.escape(str(e))}</div>", page_num, "1"
def get_total_pages(pdf_path):
    """Return the page count reported for the PDF at *pdf_path*.

    Best effort: if pdf2image cannot be imported or the lookup fails for any
    reason, the error is printed and 1 is returned instead.
    """
    try:
        # Imported lazily so an import failure is handled like any other error.
        from pdf2image import pdfinfo_from_path
        pdf_info = pdfinfo_from_path(pdf_path, poppler_path=POPLER_PATH)
        page_count = pdf_info["Pages"]
    except Exception as exc:
        print("PDF info error:", exc)
        page_count = 1
    return page_count
# ---------------- GRADIO INTERFACE ---------------- #
with gr.Blocks() as demo:
    gr.Markdown("# PDF Annotation Viewer")
    gr.Markdown("Upload a PDF and JSON file to view annotations")

    # Per-session state shared between the event handlers below.
    current_page = gr.State(1)
    current_pdf = gr.State(None)   # path of the loaded PDF, or None
    current_json = gr.State(None)  # path of the loaded JSON, or None
    total_pages = gr.State(1)

    with gr.Row():
        pdf_input = gr.File(label="PDF File", file_types=[".pdf"])
        json_input = gr.File(label="JSON File", file_types=[".json"])

    with gr.Row():
        with gr.Column(scale=2):
            with gr.Row():
                prev_btn = gr.Button("← Prev")
                # precision=0 makes the component deliver integers.
                page_display = gr.Number(value=1, label="Page", minimum=1, precision=0, interactive=True)
                next_btn = gr.Button("Next →")
                total_display = gr.Textbox("?", label="of", interactive=False)
            image_output = gr.Image(label="Annotated Page", type="filepath", height=600)
        with gr.Column(scale=1):
            # Text output with fixed height to match image
            text_output = gr.HTML(
                label="Text Content",
                value="<div style='height: 600px; overflow-y: auto; padding: 20px; text-align: center; color: #666;'>Upload files to begin</div>"
            )

    # Hidden carrier for the ID of the paragraph to highlight.
    highlight_input = gr.Textbox(visible=False)

    # ----------------- FUNCTIONS ----------------- #
    def _file_path(uploaded):
        """Return the filesystem path of an upload (file-like object or plain path)."""
        return getattr(uploaded, "name", uploaded)

    def _as_page(value):
        """Return *value* as an int >= 1, or None if it is not a usable page number."""
        try:
            page = int(value)
        except (TypeError, ValueError, OverflowError):
            return None
        return page if page >= 1 else None

    def load_files(pdf_file, json_file):
        """Render page 1 once both files are present and reset all page state.

        Returns values for: image, text, current_page, page_display,
        total_display, current_pdf, current_json, total_pages.
        """
        if not pdf_file or not json_file:
            return None, "Please upload both files", 1, 1, "?", None, None, 1
        try:
            pdf_path = _file_path(pdf_file)
            json_path = _file_path(json_file)
            total = get_total_pages(pdf_path)
            image_path, text, _, _ = process_documents(pdf_path, json_path, 1)
            return image_path, text, 1, 1, str(total), pdf_path, json_path, total
        except Exception as e:
            return None, f"Error loading files: {str(e)}", 1, 1, "?", None, None, 1

    def update_page(page_num, pdf_file, json_file, highlight=None):
        """Render *page_num*; returns image, text, page number, and a cleared highlight."""
        if not pdf_file or not json_file:
            # The fourth output is the hidden highlight box, so clear it.
            return None, "Files not loaded", page_num, ""
        image_path, text, _, _ = process_documents(
            _file_path(pdf_file), _file_path(json_file), page_num, highlight
        )
        return image_path, text, page_num, ""

    def go_to_page(requested, current, pdf_file, json_file, highlight):
        """Handle a page number typed by the user."""
        page = _as_page(requested)
        if page is None:
            # Box cleared or invalid: leave the view and state as they are.
            return gr.update(), gr.update(), current, highlight
        return update_page(page, pdf_file, json_file, highlight)

    def navigate(direction, current, total, pdf_file, json_file, highlight):
        """Step one page in *direction* ("prev"/"next"), staying within 1..total.

        Returns values for: image, text, page_display, current_page, highlight.
        """
        current = _as_page(current) or 1
        total = _as_page(total) or 1
        if direction == "next" and current < total:
            new_page = current + 1
        elif direction == "prev" and current > 1:
            new_page = current - 1
        else:
            new_page = current
        image_path, text, page, cleared = update_page(new_page, pdf_file, json_file, highlight)
        # The page goes to both the visible box and the state the next click reads.
        return image_path, text, page, page, cleared

    def go_prev(current, total, pdf_file, json_file, highlight):
        return navigate("prev", current, total, pdf_file, json_file, highlight)

    def go_next(current, total, pdf_file, json_file, highlight):
        return navigate("next", current, total, pdf_file, json_file, highlight)

    def text_click(evt: gr.SelectData):
        # Index 0 is a valid selection, so test for None rather than truthiness.
        return "" if evt.index is None else str(evt.index)

    # ----------------- EVENTS ----------------- #
    load_outputs = [
        image_output, text_output, current_page, page_display,
        total_display, current_pdf, current_json, total_pages,
    ]
    pdf_input.change(load_files, inputs=[pdf_input, json_input], outputs=load_outputs)
    json_input.change(load_files, inputs=[pdf_input, json_input], outputs=load_outputs)

    text_output.select(
        text_click,
        None,
        highlight_input
    ).then(
        update_page,
        [current_page, current_pdf, current_json, highlight_input],
        [image_output, text_output, page_display, highlight_input]
    )

    nav_inputs = [current_page, total_pages, current_pdf, current_json, highlight_input]
    nav_outputs = [image_output, text_output, page_display, current_page, highlight_input]
    prev_btn.click(go_prev, inputs=nav_inputs, outputs=nav_outputs)
    next_btn.click(go_next, inputs=nav_inputs, outputs=nav_outputs)

    # `input` fires only on user edits, so programmatic updates from the
    # buttons above do not trigger a second render of the same page.
    page_display.input(
        go_to_page,
        inputs=[page_display, current_page, current_pdf, current_json, highlight_input],
        outputs=[image_output, text_output, current_page, highlight_input]
    )

if __name__ == "__main__":
    demo.launch()