Spaces:

Cyberlgl
/

CyberLegalAIendpoint

Sleeping

File size: 5,117 Bytes
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test Mistral OCR\n",
    "\n",
    "This notebook tests the Mistral OCR API to understand how it works with scanned PDFs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from mistralai import Mistral\n",
    "import base64\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "# Read the API key from the environment (load_dotenv() above is called first)\n",
    "api_key = os.getenv(\"MISTRAL_API_KEY\")\n",
    "if not api_key:\n",
    "    print(\"❌ MISTRAL_API_KEY not found in .env\")\n",
    "    # Fail fast instead of building a client with a missing/empty key that later cells would use\n",
    "    raise RuntimeError(\"MISTRAL_API_KEY is not set\")\n",
    "\n",
    "print(\"✅ Mistral API key loaded\")\n",
    "\n",
    "# Shared client used by the OCR cell further down\n",
    "client = Mistral(api_key=api_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PDF used as input for the OCR test\n",
    "pdf_path = \"PublicWaterMassMailing.pdf\"\n",
    "\n",
    "if not os.path.exists(pdf_path):\n",
    "    print(f\"❌ PDF not found: {pdf_path}\")\n",
    "else:\n",
    "    print(f\"✅ PDF found: {pdf_path}\")\n",
    "    # Size on disk, converted from bytes to kilobytes for display\n",
    "    file_size = os.path.getsize(pdf_path) / 1024\n",
    "    print(f\"   File size: {file_size:.2f} KB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Method 1: send the PDF inline as a base64 data URL\n",
    "with open(pdf_path, 'rb') as f:\n",
    "    pdf_bytes = f.read()\n",
    "\n",
    "# Encoding does not need the file handle, so do it after the file is closed\n",
    "pdf_b64 = base64.b64encode(pdf_bytes).decode(\"ascii\")\n",
    "\n",
    "print(f\"PDF encoded to base64: {len(pdf_b64)} characters\")\n",
    "\n",
    "# Drop any `result` left from an earlier run of this cell: if the call below fails,\n",
    "# the later `'result' in locals()` guards then skip instead of showing stale output\n",
    "globals().pop(\"result\", None)\n",
    "\n",
    "try:\n",
    "    result = client.ocr.process(\n",
    "        model=\"mistral-ocr-latest\",\n",
    "        document={\n",
    "            \"type\": \"document_url\",\n",
    "            \"document_url\": f\"data:application/pdf;base64,{pdf_b64}\"\n",
    "        }\n",
    "    )\n",
    "\n",
    "    print(\"\\n✅ OCR successful!\")\n",
    "    print(f\"\\nModel used: {result.model}\")\n",
    "    print(f\"Number of pages: {len(result.pages)}\")\n",
    "\n",
    "except Exception as e:\n",
    "    # Report the failure and let the rest of the notebook continue\n",
    "    print(f\"\\n❌ OCR failed: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the top-level object returned by the OCR call (only if one exists)\n",
    "if \"result\" in locals():\n",
    "    print(\n",
    "        \"\\n=== Result Structure ===\",\n",
    "        f\"Type: {type(result)}\",\n",
    "        f\"\\nResult attributes: {dir(result)}\",\n",
    "        sep=\"\\n\",\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Walk the pages of the OCR result and show the type and attributes of each page object\n",
    "if \"result\" in locals():\n",
    "    print(\"\\n=== Pages Structure ===\")\n",
    "    for page_index, ocr_page in enumerate(result.pages):\n",
    "        print(\n",
    "            f\"\\nPage {page_index}:\",\n",
    "            f\"  Type: {type(ocr_page)}\",\n",
    "            f\"  Attributes: {dir(ocr_page)}\",\n",
    "            sep=\"\\n\",\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the markdown text of each page in the OCR result\n",
    "if \"result\" in locals():\n",
    "    print(\"\\n=== Extracted Text ===\")\n",
    "\n",
    "    for page_index, ocr_page in enumerate(result.pages):\n",
    "        print(f\"\\n--- Page {page_index} ---\")\n",
    "        if not hasattr(ocr_page, \"markdown\"):\n",
    "            print(\"No markdown attribute found\")\n",
    "            continue\n",
    "        page_text = ocr_page.markdown\n",
    "        print(page_text[:500])  # preview only: first 500 characters\n",
    "        print(f\"\\nTotal chars: {len(page_text)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join the markdown of every page into one string and write it to disk\n",
    "if \"result\" in locals():\n",
    "    # Pages are separated by a blank line in the combined text\n",
    "    full_text = \"\\n\\n\".join(ocr_page.markdown for ocr_page in result.pages)\n",
    "\n",
    "    print(\"\\n=== Full Document ===\")\n",
    "    print(f\"Total pages: {len(result.pages)}\")\n",
    "    print(f\"Total characters: {len(full_text)}\")\n",
    "\n",
    "    # Persist the combined text as UTF-8\n",
    "    with open(\"ocr_output.txt\", \"w\", encoding=\"utf-8\") as out_file:\n",
    "        out_file.write(full_text)\n",
    "\n",
    "    print(\"\\n✅ Full text saved to ocr_output.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the usage_info attribute of the OCR result, or a notice when it is absent\n",
    "if \"result\" in locals():\n",
    "    print(\"\\n=== Usage Info ===\")\n",
    "    print(result.usage_info if hasattr(result, \"usage_info\") else \"No usage_info attribute\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cyberlgl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}