AnseMin committed on
Commit
2dc4c21
·
1 Parent(s): a370b95

adding gemini flash

Browse files
.gitignore CHANGED
@@ -35,4 +35,12 @@ Thumbs.db
35
  # Specific files to ignore
36
  README_HF.md
37
  requirement.txt
38
- .env_example
 
 
 
 
 
 
 
 
 
35
  # Specific files to ignore
36
  README_HF.md
37
  requirement.txt
38
+ .env_example
39
+
40
+ # Ignore documents folder
41
+ /documents/
42
+ /documents/*
43
+
44
+ # Ignore tessdata folder
45
+ /tessdata/
46
+ /tessdata/*
README.md CHANGED
@@ -18,7 +18,7 @@ Doc2Md is a tool that converts various document formats (PDF, DOCX, etc.) to Mar
18
 
19
  ## Features
20
  - Convert documents to Markdown, JSON, Text, or Document Tags format
21
- - Multiple parsing engines: PyPdfium, Docling, and Marker
22
  - Various OCR options depending on the selected parser
23
  - Page navigation for large documents
24
  - Chat with your documents using AI
@@ -30,12 +30,26 @@ Doc2Md is a tool that converts various document formats (PDF, DOCX, etc.) to Mar
30
  pip install -e .
31
  ```
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  ## How to Use
34
 
35
  ### 1. Upload and Convert
36
  - Upload your document using the file uploader
37
- - Select a parser provider (PyPdfium, Docling, or Marker)
38
  - Choose an OCR option based on your selected provider
 
39
  - Select your desired output format
40
  - Click "Convert" to process your document
41
  - Navigate through pages using the arrow buttons
 
18
 
19
  ## Features
20
  - Convert documents to Markdown, JSON, Text, or Document Tags format
21
+ - Multiple parsing engines: PyPdfium, Docling, Marker, and Gemini Flash
22
  - Various OCR options depending on the selected parser
23
  - Page navigation for large documents
24
  - Chat with your documents using AI
 
30
  pip install -e .
31
  ```
32
 
33
+ ### Gemini Flash Parser
34
+ To use the Gemini Flash parser, you need to:
35
+ 1. Install the Google Gemini API client: `pip install google-genai`
36
+ 2. Set the `GOOGLE_API_KEY` environment variable with your Gemini API key
37
+ ```bash
38
+ # On Windows (Command Prompt)
39
+ set GOOGLE_API_KEY=your_api_key_here
40
+
41
+ # On Linux/Mac
42
+ export GOOGLE_API_KEY=your_api_key_here
43
+ ```
44
+ 3. If you do not have a Gemini API key yet, create one in [Google AI Studio](https://aistudio.google.com/app/apikey)
45
+
46
  ## How to Use
47
 
48
  ### 1. Upload and Convert
49
  - Upload your document using the file uploader
50
+ - Select a parser provider (PyPdfium, Docling, Marker, or Gemini Flash)
51
  - Choose an OCR option based on your selected provider
52
+ - Note: Gemini Flash automatically handles OCR, so no OCR option is needed
53
  - Select your desired output format
54
  - Click "Convert" to process your document
55
  - Navigate through pages using the arrow buttons
build.sh CHANGED
@@ -73,6 +73,11 @@ print(f'Available languages: {tesserocr.get_languages()}')
73
  print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[1]}')
74
  "
75
 
 
 
 
 
 
76
  # Install Python dependencies
77
  echo "Installing Python dependencies..."
78
  pip install -e .
 
73
  print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[1]}')
74
  "
75
 
76
+ # Install Google Gemini API client
77
+ echo "Installing Google Gemini API client..."
78
+ pip install -q -U google-genai
79
+ echo "Google Gemini API client installed successfully"
80
+
81
  # Install Python dependencies
82
  echo "Installing Python dependencies..."
83
  pip install -e .
requirements.txt CHANGED
@@ -16,4 +16,6 @@ tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows sy
16
  # Additional dependencies for image processing
17
  opencv-python-headless>=4.5.0 # Headless version for server environments
18
  pdf2image>=1.16.0 # For PDF processing
19
- dill==0.3.8 # Downgraded to be compatible with datasets
 
 
 
16
  # Additional dependencies for image processing
17
  opencv-python-headless>=4.5.0 # Headless version for server environments
18
  pdf2image>=1.16.0 # For PDF processing
19
+ dill==0.3.8 # Downgraded to be compatible with datasets
20
+ # Gemini API client
21
+ google-genai>=0.1.0
setup.sh CHANGED
@@ -5,6 +5,11 @@ set -e
5
 
6
  echo "Setting up Tesseract OCR environment..."
7
 
 
 
 
 
 
8
  # Create tessdata directory if it doesn't exist
9
  mkdir -p tessdata
10
 
 
5
 
6
  echo "Setting up Tesseract OCR environment..."
7
 
8
+ # Install google-genai package
9
+ echo "Installing Google Gemini API client..."
10
+ pip install -q -U google-genai
11
+ echo "Google Gemini API client installed successfully"
12
+
13
  # Create tessdata directory if it doesn't exist
14
  mkdir -p tessdata
15
 
src/parsers/__init__.py CHANGED
@@ -4,6 +4,7 @@
4
  from src.parsers.docling_parser import DoclingParser
5
  from src.parsers.marker_parser import MarkerParser
6
  from src.parsers.pypdfium_parser import PyPdfiumParser
 
7
 
8
  # You can add new parsers here in the future
9
 
 
4
  from src.parsers.docling_parser import DoclingParser
5
  from src.parsers.marker_parser import MarkerParser
6
  from src.parsers.pypdfium_parser import PyPdfiumParser
7
+ from src.parsers.gemini_flash_parser import GeminiFlashParser
8
 
9
  # You can add new parsers here in the future
10
 
src/parsers/gemini_flash_parser.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Dict, List, Optional, Any, Union
3
+ import os
4
+ import json
5
+ import tempfile
6
+ import base64
7
+ from PIL import Image
8
+ import io
9
+
10
+ from src.parsers.parser_interface import DocumentParser
11
+ from src.parsers.parser_registry import ParserRegistry
12
+
13
+ # Import the Google Gemini API client
14
+ try:
15
+ from google import genai
16
+ from google.genai import types
17
+ GEMINI_AVAILABLE = True
18
+ except ImportError:
19
+ GEMINI_AVAILABLE = False
20
+
21
+
22
class GeminiFlashParser(DocumentParser):
    """Parser implementation using Gemini Flash 2.0.

    The document is sent to the Gemini API together with a conversion prompt,
    and the model's text reply is returned in the requested output format.
    The API key is read from the ``GOOGLE_API_KEY`` environment variable.
    """

    # Model identifier sent with every generation request.
    _MODEL_NAME = "gemini-2.0-flash"

    # Files below this size are sent inline; anything larger is uploaded
    # through the File API first.
    _INLINE_SIZE_LIMIT = 20 * 1024 * 1024  # 20MB

    # MIME type used when the file extension is not in ``_MIME_TYPES``.
    _DEFAULT_MIME_TYPE = "application/pdf"

    # Lower-case file extension (with leading dot) -> MIME type.
    _MIME_TYPES: Dict[str, str] = {
        ".pdf": "application/pdf",
        ".txt": "text/plain",
        ".html": "text/html",
        ".htm": "text/html",
        ".xml": "text/xml",
        ".csv": "text/csv",
        ".md": "text/markdown",
        ".rtf": "text/rtf",
        ".js": "application/javascript",
        ".py": "text/x-python",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
        ".webp": "image/webp",
        ".tiff": "image/tiff",
        ".tif": "image/tiff",
        # Office documents
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".doc": "application/msword",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".ppt": "application/vnd.ms-powerpoint",
        # Other common document types
        ".json": "application/json",
        ".yaml": "application/x-yaml",
        ".yml": "application/x-yaml",
        ".tex": "application/x-tex",
        ".odt": "application/vnd.oasis.opendocument.text",
        ".ods": "application/vnd.oasis.opendocument.spreadsheet",
        ".odp": "application/vnd.oasis.opendocument.presentation",
    }

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "Gemini Flash"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR choices for this parser: a single "none" entry."""
        return [
            {
                "id": "none",
                "name": "None",
                "default_params": {}
            }
        ]

    @classmethod
    def get_description(cls) -> str:
        """Return a one-line description of this parser."""
        return "Gemini Flash 2.0 parser for converting documents and images to markdown"

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Gemini Flash 2.0.

        Args:
            file_path: Path of the document to convert.
            ocr_method: Accepted for interface compatibility; not used here.
            **kwargs: ``output_format`` selects the result shape: "json",
                "text", "document_tags", or anything else (including a
                missing or ``None`` value) for markdown.

        Returns:
            The converted document in the requested output format.

        Raises:
            ImportError: If the google-genai package is not installed.
            ValueError: If ``GOOGLE_API_KEY`` is not set.
            Exception: If the upload, the generation request, or the output
                formatting fails; the original error is chained as the cause.
        """
        if not GEMINI_AVAILABLE:
            raise ImportError(
                "The Google Gemini API client is not installed. "
                "Please install it with 'pip install google-genai'."
            )

        # Get API key from environment variable
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError(
                "GOOGLE_API_KEY environment variable is not set. "
                "Please set it to your Gemini API key."
            )

        client = genai.Client(api_key=api_key)

        file_path = Path(file_path)
        file_content = file_path.read_bytes()
        mime_type = self._get_mime_type(file_path.suffix.lower())

        system_prompt = (
            "You are an expert document converter that transforms documents into well-formatted markdown. "
            "Preserve the original structure, formatting, and content as accurately as possible. "
            "Include headers, lists, tables, and other formatting elements appropriately in markdown syntax. "
            "Ignore watermarks, page numbers, and other non-content elements."
        )
        user_prompt = "Convert the following document to markdown (.md file) format, preserving its structure and formatting."

        try:
            # Only the way the document is attached depends on its size; the
            # generation request itself is identical for both paths.
            if len(file_content) < self._INLINE_SIZE_LIMIT:
                document = types.Part.from_bytes(data=file_content, mime_type=mime_type)
            else:
                document = client.files.upload(
                    file=io.BytesIO(file_content),
                    config=dict(mime_type=mime_type)
                )

            # The conversion rules go in as a system instruction rather than
            # as an extra user turn, so they are not treated as document text.
            response = client.models.generate_content(
                model=self._MODEL_NAME,
                contents=[document, user_prompt],
                config=types.GenerateContentConfig(system_instruction=system_prompt),
            )

            content = response.text
            if content is None:
                # Without this check the formatter would fail on None with an
                # unhelpful AttributeError, or return None from a -> str method.
                raise ValueError("the model response contained no text")

            return self._format_output(content, kwargs.get("output_format"))

        except Exception as e:
            # Keep the generic Exception type for existing callers, but chain
            # the cause so the original traceback is not lost.
            raise Exception(f"Error processing document with Gemini Flash: {str(e)}") from e

    @staticmethod
    def _format_output(content: str, output_format: Optional[str]) -> str:
        """Wrap or strip the markdown ``content`` according to ``output_format``."""
        # A missing or None format means markdown.
        fmt = (output_format or "markdown").lower()

        if fmt == "json":
            return json.dumps({"content": content}, ensure_ascii=False, indent=2)
        if fmt == "text":
            # Remove markdown formatting characters for plain text.
            return content.translate(str.maketrans("", "", "#*_"))
        if fmt == "document_tags":
            return f"<doc>\n{content}\n</doc>"
        return content

    def _get_mime_type(self, file_extension: str) -> str:
        """Get the MIME type based on file extension.

        The lookup is case-insensitive. Unknown extensions fall back to
        ``application/pdf``.
        """
        return self._MIME_TYPES.get(file_extension.lower(), self._DEFAULT_MIME_TYPE)
169
+
170
+
171
# Add the parser to the registry only when the google-genai import succeeded;
# otherwise report that it was skipped.
if not GEMINI_AVAILABLE:
    print("Gemini Flash parser not registered: google-genai package not installed")
else:
    ParserRegistry.register(GeminiFlashParser)
tests/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Test suite for the application."""