PDF-Parser

Running

App Files Community

saifisvibinn committed on Nov 18, 2025

Commit

cd71fbc

1 Parent(s): 27a4694

Add /api/predict endpoint implementation

Browse files

Files changed (1) hide show

app.py +190 -0

app.py CHANGED Viewed

@@ -89,6 +89,196 @@ def api_docs():
     return render_template('api_docs.html', routes=routes, base_url=base_url)
 @app.route('/api/device-info')
 def device_info():
     """API endpoint to get device information."""

     return render_template('api_docs.html', routes=routes, base_url=base_url)
+@app.route('/api/predict', methods=['POST'])
+def predict():
+    """
+    Clean REST API endpoint for PDF extraction.
+    Accepts a PDF file and returns extracted text, tables, and figures.
+    Request:
+        - Method: POST
+        - Content-Type: multipart/form-data
+        - Body: file (PDF file)
+    Response:
+        {
+            "status": "success",
+            "filename": "document.pdf",
+            "text": "extracted markdown text...",
+            "tables": [...],
+            "figures": [...],
+            "summary": {...}
+        }
+    """
+    try:
+        # Check if file is present
+        if 'file' not in request.files:
+            return jsonify({
+                'status': 'error',
+                'error': 'No file provided. Please upload a PDF file using the "file" field.'
+            }), 400
+        file = request.files['file']
+        if file.filename == '':
+            return jsonify({
+                'status': 'error',
+                'error': 'No file selected'
+            }), 400
+        if not file.filename.lower().endswith('.pdf'):
+            return jsonify({
+                'status': 'error',
+                'error': 'Invalid file type. Please upload a PDF file.'
+            }), 400
+        filename = secure_filename(file.filename)
+        stem = Path(filename).stem
+        # Create temporary directories for processing
+        temp_upload = Path(app.config['UPLOAD_FOLDER']) / f"temp_{uuid.uuid4().hex}"
+        temp_output = Path(app.config['OUTPUT_FOLDER']) / f"temp_{uuid.uuid4().hex}"
+        temp_upload.parent.mkdir(parents=True, exist_ok=True)
+        temp_output.mkdir(parents=True, exist_ok=True)
+        try:
+            # Save uploaded file
+            pdf_path = temp_upload / filename
+            file_data = file.read()
+            pdf_path.write_bytes(file_data)
+            # Load model if needed
+            load_model_once()
+            # Process PDF (extract both images and markdown)
+            extractor.USE_MULTIPROCESSING = False
+            extractor.process_pdf_with_pool(
+                pdf_path,
+                temp_output,
+                pool=None,
+                extract_images=True,
+                extract_markdown=True,
+            )
+            # Collect extracted data
+            result = {
+                'status': 'success',
+                'filename': filename,
+                'text': '',
+                'tables': [],
+                'figures': [],
+                'summary': {
+                    'total_pages': 0,
+                    'figures_count': 0,
+                    'tables_count': 0,
+                    'elements_count': 0
+                }
+            }
+            # Extract markdown text
+            markdown_path = temp_output / f"{stem}.md"
+            if markdown_path.exists():
+                result['text'] = markdown_path.read_text(encoding='utf-8')
+            # Extract figures and tables from JSON
+            json_path = temp_output / f"{stem}_content_list.json"
+            if json_path.exists():
+                elements = json.loads(json_path.read_text(encoding='utf-8'))
+                figures = [e for e in elements if e.get('type') == 'figure']
+                tables = [e for e in elements if e.get('type') == 'table']
+                # Get page count
+                try:
+                    import pypdfium2 as pdfium
+                    pdf_bytes = pdf_path.read_bytes()
+                    doc = pdfium.PdfDocument(pdf_bytes)
+                    result['summary']['total_pages'] = len(doc)
+                    doc.close()
+                except:
+                    pass
+                # Format figures
+                for fig in figures:
+                    figure_data = {
+                        'page': fig.get('page', 0),
+                        'bbox': fig.get('bbox_pixels', []),
+                        'confidence': fig.get('conf', 0.0),
+                        'width': fig.get('width', 0),
+                        'height': fig.get('height', 0),
+                    }
+                    # Include image path if available
+                    if fig.get('image_path'):
+                        img_path = temp_output / fig['image_path']
+                        if img_path.exists():
+                            # Convert image to base64 for API response
+                            from PIL import Image
+                            import io
+                            img = Image.open(img_path)
+                            img_buffer = io.BytesIO()
+                            img.save(img_buffer, format='PNG')
+                            img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
+                            figure_data['image_base64'] = f"data:image/png;base64,{img_base64}"
+                            figure_data['image_path'] = fig['image_path']
+                    result['figures'].append(figure_data)
+                # Format tables
+                for tab in tables:
+                    table_data = {
+                        'page': tab.get('page', 0),
+                        'bbox': tab.get('bbox_pixels', []),
+                        'confidence': tab.get('conf', 0.0),
+                        'width': tab.get('width', 0),
+                        'height': tab.get('height', 0),
+                    }
+                    # Include image path if available
+                    if tab.get('image_path'):
+                        img_path = temp_output / tab['image_path']
+                        if img_path.exists():
+                            # Convert image to base64 for API response
+                            from PIL import Image
+                            import io
+                            img = Image.open(img_path)
+                            img_buffer = io.BytesIO()
+                            img.save(img_buffer, format='PNG')
+                            img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
+                            table_data['image_base64'] = f"data:image/png;base64,{img_base64}"
+                            table_data['image_path'] = tab['image_path']
+                    result['tables'].append(table_data)
+                result['summary']['figures_count'] = len(figures)
+                result['summary']['tables_count'] = len(tables)
+                result['summary']['elements_count'] = len(elements)
+            return jsonify(result)
+        finally:
+            # Clean up temporary files
+            try:
+                if temp_upload.exists():
+                    if temp_upload.is_file():
+                        temp_upload.unlink()
+                    else:
+                        shutil.rmtree(temp_upload, ignore_errors=True)
+                if temp_output.exists():
+                    shutil.rmtree(temp_output, ignore_errors=True)
+            except Exception as e:
+                logger.warning(f"Error cleaning up temp files: {e}")
+    except Exception as e:
+        logger.error(f"Error in /api/predict: {e}")
+        import traceback
+        logger.error(traceback.format_exc())
+        return jsonify({
+            'status': 'error',
+            'error': str(e)
+        }), 500
 @app.route('/api/device-info')
 def device_info():
     """API endpoint to get device information."""