Spaces:
Sleeping
Sleeping
File size: 5,117 Bytes
6853143 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | {
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test Mistral OCR\n",
"\n",
"This notebook tests the Mistral OCR API to understand how it works with scanned PDFs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Setup cell: imports, environment loading, and Mistral client construction.\n",
"import base64\n",
"import os\n",
"\n",
"from dotenv import load_dotenv\n",
"from mistralai import Mistral\n",
"\n",
"# Load environment variables (expected to come from a local .env file).\n",
"load_dotenv()\n",
"\n",
"# Initialize Mistral client\n",
"api_key = os.getenv(\"MISTRAL_API_KEY\")\n",
"if not api_key:\n",
"    # Fail fast: stop here instead of building a client without a key and\n",
"    # only discovering the problem later, inside the OCR call.\n",
"    raise RuntimeError(\"❌ MISTRAL_API_KEY not found in .env\")\n",
"\n",
"print(\"✅ Mistral API key loaded\")\n",
"client = Mistral(api_key=api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Locate the sample PDF and report its size before running OCR on it.\n",
"pdf_path = \"PublicWaterMassMailing.pdf\"\n",
"\n",
"if not os.path.exists(pdf_path):\n",
"    print(f\"❌ PDF not found: {pdf_path}\")\n",
"else:\n",
"    print(f\"✅ PDF found: {pdf_path}\")\n",
"    # getsize() reports bytes; divide by 1024 to show kilobytes.\n",
"    file_size = os.path.getsize(pdf_path) / 1024\n",
"    print(f\" File size: {file_size:.2f} KB\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Method 1: Test with base64 encoded PDF\n",
"#\n",
"# Forget any `result` left over from an earlier run of this cell, so that a\n",
"# failed call cannot leave the later `'result' in locals()` guards looking at\n",
"# stale data.\n",
"globals().pop(\"result\", None)\n",
"\n",
"with open(pdf_path, 'rb') as f:\n",
"    pdf_bytes = f.read()\n",
"\n",
"# Base64 output is pure ASCII, so decode it explicitly as such.\n",
"pdf_b64 = base64.b64encode(pdf_bytes).decode(\"ascii\")\n",
"\n",
"print(f\"PDF encoded to base64: {len(pdf_b64)} characters\")\n",
"\n",
"try:\n",
"    # The PDF is passed inline as a data URI in the `document_url` field.\n",
"    result = client.ocr.process(\n",
"        model=\"mistral-ocr-latest\",\n",
"        document={\n",
"            \"type\": \"document_url\",\n",
"            \"document_url\": f\"data:application/pdf;base64,{pdf_b64}\"\n",
"        }\n",
"    )\n",
"\n",
"    print(\"\\n✅ OCR successful!\")\n",
"    print(f\"\\nModel used: {result.model}\")\n",
"    print(f\"Number of pages: {len(result.pages)}\")\n",
"\n",
"except Exception as e:\n",
"    # Best-effort exploration: report the failure and let later cells skip\n",
"    # themselves because `result` is not defined.\n",
"    print(f\"\\n❌ OCR failed: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the OCR response object: its type and the attributes it exposes.\n",
"# Skipped when no `result` is defined.\n",
"if 'result' in locals():\n",
"    print(\n",
"        \"\\n=== Result Structure ===\",\n",
"        f\"Type: {type(result)}\",\n",
"        f\"\\nResult attributes: {dir(result)}\",\n",
"        sep=\"\\n\",\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect each page object: its type and the attributes it exposes.\n",
"# Skipped when no `result` is defined.\n",
"if 'result' in locals():\n",
"    print(\"\\n=== Pages Structure ===\")\n",
"    for i, page in enumerate(result.pages):\n",
"        print(\n",
"            f\"\\nPage {i}:\",\n",
"            f\" Type: {type(page)}\",\n",
"            f\" Attributes: {dir(page)}\",\n",
"            sep=\"\\n\",\n",
"        )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Preview the markdown text of every page (first 500 characters each).\n",
"# Skipped when no `result` is defined.\n",
"if 'result' in locals():\n",
"    print(\"\\n=== Extracted Text ===\")\n",
"\n",
"    for i, page in enumerate(result.pages):\n",
"        print(f\"\\n--- Page {i} ---\")\n",
"\n",
"        # Pages without a `markdown` attribute are reported and skipped.\n",
"        if not hasattr(page, 'markdown'):\n",
"            print(\"No markdown attribute found\")\n",
"            continue\n",
"\n",
"        print(page.markdown[:500])  # First 500 chars\n",
"        print(f\"\\nTotal chars: {len(page.markdown)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Join the markdown of all pages into one document and write it to disk.\n",
"# Skipped when no `result` is defined.\n",
"if 'result' in locals():\n",
"    # Pages are separated by a blank line in the combined text.\n",
"    full_text = \"\\n\\n\".join(p.markdown for p in result.pages)\n",
"\n",
"    print(\n",
"        f\"\\n=== Full Document ===\",\n",
"        f\"Total pages: {len(result.pages)}\",\n",
"        f\"Total characters: {len(full_text)}\",\n",
"        sep=\"\\n\",\n",
"    )\n",
"\n",
"    # Save to file\n",
"    with open(\"ocr_output.txt\", \"w\", encoding=\"utf-8\") as out_file:\n",
"        out_file.write(full_text)\n",
"\n",
"    print(\"\\n✅ Full text saved to ocr_output.txt\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the response's usage information, if it has any.\n",
"# Skipped when no `result` is defined.\n",
"if 'result' in locals():\n",
"    print(\"\\n=== Usage Info ===\")\n",
"    print(\n",
"        result.usage_info\n",
"        if hasattr(result, 'usage_info')\n",
"        else \"No usage_info attribute\"\n",
"    )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "cyberlgl",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
|