Spaces:

Ansemin101
/

Markit

Paused

App Files Files Community

AnseMin committed on Mar 5, 2025

Commit

2dc4c21

1 Parent(s): a370b95

adding gemini flash

Browse files

Files changed (8) hide show

.gitignore +9 -1
README.md +16 -2
build.sh +5 -0
requirements.txt +3 -1
setup.sh +5 -0
src/parsers/__init__.py +1 -0
src/parsers/gemini_flash_parser.py +175 -0
tests/__init__.py +0 -1

.gitignore CHANGED Viewed

@@ -35,4 +35,12 @@ Thumbs.db
 # Specific files to ignore
 README_HF.md
 requirement.txt
-.env_example

 # Specific files to ignore
 README_HF.md
 requirement.txt
+.env_example
+# Ignore documents folder
+/documents/
+/documents/*
+# Ignore tessdata folder
+/tessdata/
+/tessdata/*

README.md CHANGED Viewed

@@ -18,7 +18,7 @@ Doc2Md is a tool that converts various document formats (PDF, DOCX, etc.) to Mar
 ## Features
 - Convert documents to Markdown, JSON, Text, or Document Tags format
-- Multiple parsing engines: PyPdfium, Docling, and Marker
 - Various OCR options depending on the selected parser
 - Page navigation for large documents
 - Chat with your documents using AI
@@ -30,12 +30,26 @@ Doc2Md is a tool that converts various document formats (PDF, DOCX, etc.) to Mar
 pip install -e .
 ```
 ## How to Use
 ### 1. Upload and Convert
 - Upload your document using the file uploader
-- Select a parser provider (PyPdfium, Docling, or Marker)
 - Choose an OCR option based on your selected provider
 - Select your desired output format
 - Click "Convert" to process your document
 - Navigate through pages using the arrow buttons

 ## Features
 - Convert documents to Markdown, JSON, Text, or Document Tags format
+- Multiple parsing engines: PyPdfium, Docling, Marker, and Gemini Flash
 - Various OCR options depending on the selected parser
 - Page navigation for large documents
 - Chat with your documents using AI
 pip install -e .
 ```
+### Gemini Flash Parser
+To use the Gemini Flash parser, you need to:
+1. Install the Google Gemini API client: `pip install google-genai`
+2. Set the `GOOGLE_API_KEY` environment variable with your Gemini API key
+   ```bash
+   # On Windows
+   set GOOGLE_API_KEY=your_api_key_here
+   # On Linux/Mac
+   export GOOGLE_API_KEY=your_api_key_here
+   ```
+3. You can obtain a Gemini API key from [Google AI Studio](https://aistudio.google.com/app/apikey)
 ## How to Use
 ### 1. Upload and Convert
 - Upload your document using the file uploader
+- Select a parser provider (PyPdfium, Docling, Marker, or Gemini Flash)
 - Choose an OCR option based on your selected provider
+  - Note: Gemini Flash automatically handles OCR, so no OCR option is needed
 - Select your desired output format
 - Click "Convert" to process your document
 - Navigate through pages using the arrow buttons

build.sh CHANGED Viewed

@@ -73,6 +73,11 @@ print(f'Available languages: {tesserocr.get_languages()}')
 print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[1]}')
 "
 # Install Python dependencies
 echo "Installing Python dependencies..."
 pip install -e .

 print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[1]}')
 "
+# Install Google Gemini API client
+echo "Installing Google Gemini API client..."
+pip install -q -U google-genai
+echo "Google Gemini API client installed successfully"
 # Install Python dependencies
 echo "Installing Python dependencies..."
 pip install -e .

requirements.txt CHANGED Viewed

@@ -16,4 +16,6 @@ tesserocr>=2.5.0; platform_system != "Windows"  # Only install on non-Windows sy
 # Additional dependencies for image processing
 opencv-python-headless>=4.5.0  # Headless version for server environments
 pdf2image>=1.16.0  # For PDF processing
-dill==0.3.8  # Downgraded to be compatible with datasets

 # Additional dependencies for image processing
 opencv-python-headless>=4.5.0  # Headless version for server environments
 pdf2image>=1.16.0  # For PDF processing
+dill==0.3.8  # Downgraded to be compatible with datasets
+# Gemini API client
+google-genai>=0.1.0

setup.sh CHANGED Viewed

@@ -5,6 +5,11 @@ set -e
 echo "Setting up Tesseract OCR environment..."
 # Create tessdata directory if it doesn't exist
 mkdir -p tessdata

 echo "Setting up Tesseract OCR environment..."
+# Install google-genai package
+echo "Installing Google Gemini API client..."
+pip install -q -U google-genai
+echo "Google Gemini API client installed successfully"
 # Create tessdata directory if it doesn't exist
 mkdir -p tessdata

src/parsers/__init__.py CHANGED Viewed

@@ -4,6 +4,7 @@
 from src.parsers.docling_parser import DoclingParser
 from src.parsers.marker_parser import MarkerParser
 from src.parsers.pypdfium_parser import PyPdfiumParser
 # You can add new parsers here in the future

 from src.parsers.docling_parser import DoclingParser
 from src.parsers.marker_parser import MarkerParser
 from src.parsers.pypdfium_parser import PyPdfiumParser
+from src.parsers.gemini_flash_parser import GeminiFlashParser
 # You can add new parsers here in the future

src/parsers/gemini_flash_parser.py ADDED Viewed

	@@ -0,0 +1,175 @@

+from pathlib import Path
+from typing import Dict, List, Optional, Any, Union
+import os
+import json
+import tempfile
+import base64
+from PIL import Image
+import io
+from src.parsers.parser_interface import DocumentParser
+from src.parsers.parser_registry import ParserRegistry
+# Import the Google Gemini API client
+try:
+    from google import genai
+    from google.genai import types
+    GEMINI_AVAILABLE = True
+except ImportError:
+    GEMINI_AVAILABLE = False
+class GeminiFlashParser(DocumentParser):
+    """Document parser that delegates conversion to the Gemini 2.0 Flash model.
+
+    The input file is sent to the Gemini API together with a conversion prompt;
+    the model's markdown answer is returned as-is or re-wrapped as JSON, plain
+    text, or document tags.
+    """
+
+    # Files below this size are embedded in the request; larger ones are
+    # uploaded through the File API first.
+    _INLINE_SIZE_LIMIT = 20 * 1024 * 1024  # 20MB
+    _MODEL_NAME = "gemini-2.0-flash"
+
+    # Extension (lower-case, with leading dot) -> MIME type sent to the API.
+    _MIME_TYPES = {
+        ".pdf": "application/pdf",
+        ".txt": "text/plain",
+        ".html": "text/html",
+        ".htm": "text/html",
+        ".xml": "text/xml",
+        ".csv": "text/csv",
+        ".md": "text/markdown",
+        ".rtf": "text/rtf",
+        ".js": "application/javascript",
+        ".py": "text/x-python",
+        ".jpg": "image/jpeg",
+        ".jpeg": "image/jpeg",
+        ".png": "image/png",
+        ".gif": "image/gif",
+        ".bmp": "image/bmp",
+        ".webp": "image/webp",
+        ".tiff": "image/tiff",
+        ".tif": "image/tiff",
+        # Office documents
+        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        ".xls": "application/vnd.ms-excel",
+        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        ".doc": "application/msword",
+        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        ".ppt": "application/vnd.ms-powerpoint",
+        # Other common document types
+        ".json": "application/json",
+        ".yaml": "application/x-yaml",
+        ".yml": "application/x-yaml",
+        ".tex": "application/x-tex",
+        ".odt": "application/vnd.oasis.opendocument.text",
+        ".ods": "application/vnd.oasis.opendocument.spreadsheet",
+        ".odp": "application/vnd.oasis.opendocument.presentation",
+    }
+
+    @classmethod
+    def get_name(cls) -> str:
+        """Return the name of this parser."""
+        return "Gemini Flash"
+
+    @classmethod
+    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
+        """Return the OCR choices for this parser: a single "None" entry with no parameters."""
+        return [
+            {
+                "id": "none",
+                "name": "None",
+                "default_params": {},
+            }
+        ]
+
+    @classmethod
+    def get_description(cls) -> str:
+        """Return a one-line description of this parser."""
+        return "Gemini Flash 2.0 parser for converting documents and images to markdown"
+
+    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
+        """Convert a document or image to markdown using Gemini 2.0 Flash.
+
+        Args:
+            file_path: Path of the file to convert.
+            ocr_method: Not used by this parser.
+            **kwargs: ``output_format`` selects the shape of the result:
+                "markdown" (default), "json", "text" or "document_tags",
+                matched case-insensitively. Other keys are ignored.
+
+        Returns:
+            The converted document in the requested output format.
+
+        Raises:
+            ImportError: If the google-genai package could not be imported.
+            ValueError: If the GOOGLE_API_KEY environment variable is unset.
+            Exception: If the upload or generation request fails, or the model
+                returns no text. The underlying error is chained as the cause.
+        """
+        if not GEMINI_AVAILABLE:
+            raise ImportError(
+                "The Google Gemini API client is not installed. "
+                "Please install it with 'pip install google-genai'."
+            )
+        api_key = os.environ.get("GOOGLE_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "GOOGLE_API_KEY environment variable is not set. "
+                "Please set it to your Gemini API key."
+            )
+        client = genai.Client(api_key=api_key)
+
+        file_path = Path(file_path)
+        mime_type = self._get_mime_type(file_path.suffix.lower())
+        # Decide from the size on disk so that large files are never loaded
+        # into memory just to find out they must be uploaded instead.
+        send_inline = file_path.stat().st_size < self._INLINE_SIZE_LIMIT
+        inline_bytes = file_path.read_bytes() if send_inline else None
+
+        system_prompt = (
+            "You are an expert document converter that transforms documents into well-formatted markdown. "
+            "Preserve the original structure, formatting, and content as accurately as possible. "
+            "Include headers, lists, tables, and other formatting elements appropriately in markdown syntax. "
+            "Ignore watermarks, page numbers, and other non-content elements."
+        )
+        user_prompt = "Convert the following document to markdown (.md file) format, preserving its structure and formatting."
+
+        try:
+            if send_inline:
+                document = types.Part.from_bytes(data=inline_bytes, mime_type=mime_type)
+            else:
+                # The File API reads the file from its path itself.
+                document = client.files.upload(
+                    file=str(file_path),
+                    config=dict(mime_type=mime_type)
+                )
+            response = client.models.generate_content(
+                model=self._MODEL_NAME,
+                contents=[document, user_prompt],
+                # Pass the instructions as a system instruction rather than
+                # as an ordinary piece of user content.
+                config=types.GenerateContentConfig(system_instruction=system_prompt),
+            )
+            content = response.text
+            if content is None:
+                # Without this check the formatting step would either crash
+                # with an opaque AttributeError or hand None back to the caller.
+                raise ValueError("Gemini Flash returned no text for this document")
+            return self._format_output(content, kwargs.get("output_format"))
+        except Exception as e:
+            raise Exception(f"Error processing document with Gemini Flash: {str(e)}") from e
+
+    @staticmethod
+    def _format_output(content: str, output_format: Optional[str]) -> str:
+        """Wrap the model's markdown in the requested output format."""
+        fmt = (output_format or "markdown").lower()
+        if fmt == "json":
+            return json.dumps({"content": content}, ensure_ascii=False, indent=2)
+        if fmt == "text":
+            # Crude plain-text conversion: drops every '#', '*' and '_',
+            # including ones that are part of the document's actual text.
+            return content.translate(str.maketrans("", "", "#*_"))
+        if fmt == "document_tags":
+            return f"<doc>\n{content}\n</doc>"
+        return content
+
+    def _get_mime_type(self, file_extension: str) -> str:
+        """Return the MIME type for a lower-case, dot-prefixed file extension.
+
+        Extensions missing from the table are reported as "application/pdf".
+        """
+        return self._MIME_TYPES.get(file_extension, "application/pdf")  # Default to PDF if unknown
+# Only add the parser to the registry when the google-genai import succeeded
+if not GEMINI_AVAILABLE:
+    print("Gemini Flash parser not registered: google-genai package not installed")
+else:
+    ParserRegistry.register(GeminiFlashParser)

tests/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- """Test suite for the application."""