Merge branch 'master' into dev
Browse files- .gitignore +1 -0
- marker_server.py +19 -7
- signatures/version1/cla.json +40 -0
.gitignore
CHANGED
|
@@ -12,6 +12,7 @@ debug_data
|
|
| 12 |
temp.md
|
| 13 |
temp
|
| 14 |
conversion_results
|
|
|
|
| 15 |
|
| 16 |
# Byte-compiled / optimized / DLL files
|
| 17 |
__pycache__/
|
|
|
|
| 12 |
temp.md
|
| 13 |
temp
|
| 14 |
conversion_results
|
| 15 |
+
uploads
|
| 16 |
|
| 17 |
# Byte-compiled / optimized / DLL files
|
| 18 |
__pycache__/
|
marker_server.py
CHANGED
|
@@ -18,6 +18,14 @@ from marker.models import create_model_dict
|
|
| 18 |
|
| 19 |
app_data = {}
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
@asynccontextmanager
|
| 22 |
async def lifespan(app: FastAPI):
|
| 23 |
app_data["models"] = create_model_dict()
|
|
@@ -30,10 +38,11 @@ async def lifespan(app: FastAPI):
|
|
| 30 |
|
| 31 |
app = FastAPI(lifespan=lifespan)
|
| 32 |
|
|
|
|
| 33 |
@app.get("/")
|
| 34 |
async def root():
|
| 35 |
return HTMLResponse(
|
| 36 |
-
"""
|
| 37 |
<h1>Marker API</h1>
|
| 38 |
<ul>
|
| 39 |
<li><a href="/docs">API Documentation</a></li>
|
|
@@ -45,8 +54,7 @@ async def root():
|
|
| 45 |
|
| 46 |
class CommonParams(BaseModel):
|
| 47 |
filepath: Annotated[
|
| 48 |
-
str,
|
| 49 |
-
Field(description="The path to the PDF file to convert.")
|
| 50 |
]
|
| 51 |
page_range: Annotated[
|
| 52 |
Optional[str],
|
|
@@ -58,11 +66,15 @@ class CommonParams(BaseModel):
|
|
| 58 |
] = None
|
| 59 |
force_ocr: Annotated[
|
| 60 |
bool,
|
| 61 |
-
Field(
|
|
|
|
|
|
|
| 62 |
] = False
|
| 63 |
paginate_output: Annotated[
|
| 64 |
bool,
|
| 65 |
-
Field(
|
|
|
|
|
|
|
| 66 |
] = False
|
| 67 |
output_format: Annotated[
|
| 68 |
str,
|
|
@@ -106,7 +118,7 @@ async def convert_pdf(
|
|
| 106 |
"output": text,
|
| 107 |
"images": encoded,
|
| 108 |
"metadata": metadata,
|
| 109 |
-
"success": True
|
| 110 |
}
|
| 111 |
|
| 112 |
|
|
@@ -123,4 +135,4 @@ def main(port: int, host: str):
|
|
| 123 |
|
| 124 |
|
| 125 |
if __name__ == "__main__":
|
| 126 |
-
main()
|
|
|
|
| 18 |
|
| 19 |
app_data = {}
|
| 20 |
|
| 21 |
+
|
| 22 |
+
UPLOAD_DIRECTORY = "./uploads" # Directory to store uploaded files
|
| 23 |
+
|
| 24 |
+
# Ensure the upload directory exists
|
| 25 |
+
if not os.path.exists(UPLOAD_DIRECTORY):
|
| 26 |
+
os.makedirs(UPLOAD_DIRECTORY)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
@asynccontextmanager
|
| 30 |
async def lifespan(app: FastAPI):
|
| 31 |
app_data["models"] = create_model_dict()
|
|
|
|
| 38 |
|
| 39 |
app = FastAPI(lifespan=lifespan)
|
| 40 |
|
| 41 |
+
|
| 42 |
@app.get("/")
|
| 43 |
async def root():
|
| 44 |
return HTMLResponse(
|
| 45 |
+
"""
|
| 46 |
<h1>Marker API</h1>
|
| 47 |
<ul>
|
| 48 |
<li><a href="/docs">API Documentation</a></li>
|
|
|
|
| 54 |
|
| 55 |
class CommonParams(BaseModel):
|
| 56 |
filepath: Annotated[
|
| 57 |
+
Optional[str], Field(description="The path to the PDF file to convert.")
|
|
|
|
| 58 |
]
|
| 59 |
page_range: Annotated[
|
| 60 |
Optional[str],
|
|
|
|
| 66 |
] = None
|
| 67 |
force_ocr: Annotated[
|
| 68 |
bool,
|
| 69 |
+
Field(
|
| 70 |
+
description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
|
| 71 |
+
),
|
| 72 |
] = False
|
| 73 |
paginate_output: Annotated[
|
| 74 |
bool,
|
| 75 |
+
Field(
|
| 76 |
+
description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
|
| 77 |
+
),
|
| 78 |
] = False
|
| 79 |
output_format: Annotated[
|
| 80 |
str,
|
|
|
|
| 118 |
"output": text,
|
| 119 |
"images": encoded,
|
| 120 |
"metadata": metadata,
|
| 121 |
+
"success": True,
|
| 122 |
}
|
| 123 |
|
| 124 |
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
if __name__ == "__main__":
|
| 138 |
+
main()
|
signatures/version1/cla.json
CHANGED
|
@@ -47,6 +47,46 @@
|
|
| 47 |
"created_at": "2024-08-18T07:44:04Z",
|
| 48 |
"repoId": 712111618,
|
| 49 |
"pullRequestNo": 257
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
}
|
| 51 |
]
|
| 52 |
}
|
|
|
|
| 47 |
"created_at": "2024-08-18T07:44:04Z",
|
| 48 |
"repoId": 712111618,
|
| 49 |
"pullRequestNo": 257
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"name": "conscienceli",
|
| 53 |
+
"id": 4034943,
|
| 54 |
+
"comment_id": 2333374932,
|
| 55 |
+
"created_at": "2024-09-06T06:56:25Z",
|
| 56 |
+
"repoId": 712111618,
|
| 57 |
+
"pullRequestNo": 276
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "jcytong",
|
| 61 |
+
"id": 156466,
|
| 62 |
+
"comment_id": 2334987739,
|
| 63 |
+
"created_at": "2024-09-07T01:27:51Z",
|
| 64 |
+
"repoId": 712111618,
|
| 65 |
+
"pullRequestNo": 278
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"name": "syldor",
|
| 69 |
+
"id": 4158062,
|
| 70 |
+
"comment_id": 2428114896,
|
| 71 |
+
"created_at": "2024-10-22T03:05:34Z",
|
| 72 |
+
"repoId": 712111618,
|
| 73 |
+
"pullRequestNo": 309
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"name": "iammosespaulr",
|
| 77 |
+
"id": 28682735,
|
| 78 |
+
"comment_id": 2448054789,
|
| 79 |
+
"created_at": "2024-10-30T18:36:24Z",
|
| 80 |
+
"repoId": 712111618,
|
| 81 |
+
"pullRequestNo": 327
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"name": "tjbck",
|
| 85 |
+
"id": 25473318,
|
| 86 |
+
"comment_id": 2454051305,
|
| 87 |
+
"created_at": "2024-11-04T08:09:46Z",
|
| 88 |
+
"repoId": 712111618,
|
| 89 |
+
"pullRequestNo": 332
|
| 90 |
}
|
| 91 |
]
|
| 92 |
}
|