Vik Paruchuri committed on
Commit
77842ad
·
2 Parent(s): 002d7b0 fcfa501

Merge branch 'master' into dev

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. marker_server.py +19 -7
  3. signatures/version1/cla.json +40 -0
.gitignore CHANGED
@@ -12,6 +12,7 @@ debug_data
12
  temp.md
13
  temp
14
  conversion_results
 
15
 
16
  # Byte-compiled / optimized / DLL files
17
  __pycache__/
 
12
  temp.md
13
  temp
14
  conversion_results
15
+ uploads
16
 
17
  # Byte-compiled / optimized / DLL files
18
  __pycache__/
marker_server.py CHANGED
@@ -18,6 +18,14 @@ from marker.models import create_model_dict
18
 
19
  app_data = {}
20
 
 
 
 
 
 
 
 
 
21
  @asynccontextmanager
22
  async def lifespan(app: FastAPI):
23
  app_data["models"] = create_model_dict()
@@ -30,10 +38,11 @@ async def lifespan(app: FastAPI):
30
 
31
  app = FastAPI(lifespan=lifespan)
32
 
 
33
  @app.get("/")
34
  async def root():
35
  return HTMLResponse(
36
- """
37
  <h1>Marker API</h1>
38
  <ul>
39
  <li><a href="/docs">API Documentation</a></li>
@@ -45,8 +54,7 @@ async def root():
45
 
46
  class CommonParams(BaseModel):
47
  filepath: Annotated[
48
- str,
49
- Field(description="The path to the PDF file to convert.")
50
  ]
51
  page_range: Annotated[
52
  Optional[str],
@@ -58,11 +66,15 @@ class CommonParams(BaseModel):
58
  ] = None
59
  force_ocr: Annotated[
60
  bool,
61
- Field(description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases).")
 
 
62
  ] = False
63
  paginate_output: Annotated[
64
  bool,
65
- Field(description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines).")
 
 
66
  ] = False
67
  output_format: Annotated[
68
  str,
@@ -106,7 +118,7 @@ async def convert_pdf(
106
  "output": text,
107
  "images": encoded,
108
  "metadata": metadata,
109
- "success": True
110
  }
111
 
112
 
@@ -123,4 +135,4 @@ def main(port: int, host: str):
123
 
124
 
125
  if __name__ == "__main__":
126
- main()
 
18
 
19
  app_data = {}
20
 
21
+
22
+ UPLOAD_DIRECTORY = "./uploads" # Directory to store uploaded files
23
+
24
+ # Ensure the upload directory exists
25
+ if not os.path.exists(UPLOAD_DIRECTORY):
26
+ os.makedirs(UPLOAD_DIRECTORY)
27
+
28
+
29
  @asynccontextmanager
30
  async def lifespan(app: FastAPI):
31
  app_data["models"] = create_model_dict()
 
38
 
39
  app = FastAPI(lifespan=lifespan)
40
 
41
+
42
  @app.get("/")
43
  async def root():
44
  return HTMLResponse(
45
+ """
46
  <h1>Marker API</h1>
47
  <ul>
48
  <li><a href="/docs">API Documentation</a></li>
 
54
 
55
  class CommonParams(BaseModel):
56
  filepath: Annotated[
57
+ Optional[str], Field(description="The path to the PDF file to convert.")
 
58
  ]
59
  page_range: Annotated[
60
  Optional[str],
 
66
  ] = None
67
  force_ocr: Annotated[
68
  bool,
69
+ Field(
70
+ description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
71
+ ),
72
  ] = False
73
  paginate_output: Annotated[
74
  bool,
75
+ Field(
76
+ description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
77
+ ),
78
  ] = False
79
  output_format: Annotated[
80
  str,
 
118
  "output": text,
119
  "images": encoded,
120
  "metadata": metadata,
121
+ "success": True,
122
  }
123
 
124
 
 
135
 
136
 
137
  if __name__ == "__main__":
138
+ main()
signatures/version1/cla.json CHANGED
@@ -47,6 +47,46 @@
47
  "created_at": "2024-08-18T07:44:04Z",
48
  "repoId": 712111618,
49
  "pullRequestNo": 257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
  ]
52
  }
 
47
  "created_at": "2024-08-18T07:44:04Z",
48
  "repoId": 712111618,
49
  "pullRequestNo": 257
50
+ },
51
+ {
52
+ "name": "conscienceli",
53
+ "id": 4034943,
54
+ "comment_id": 2333374932,
55
+ "created_at": "2024-09-06T06:56:25Z",
56
+ "repoId": 712111618,
57
+ "pullRequestNo": 276
58
+ },
59
+ {
60
+ "name": "jcytong",
61
+ "id": 156466,
62
+ "comment_id": 2334987739,
63
+ "created_at": "2024-09-07T01:27:51Z",
64
+ "repoId": 712111618,
65
+ "pullRequestNo": 278
66
+ },
67
+ {
68
+ "name": "syldor",
69
+ "id": 4158062,
70
+ "comment_id": 2428114896,
71
+ "created_at": "2024-10-22T03:05:34Z",
72
+ "repoId": 712111618,
73
+ "pullRequestNo": 309
74
+ },
75
+ {
76
+ "name": "iammosespaulr",
77
+ "id": 28682735,
78
+ "comment_id": 2448054789,
79
+ "created_at": "2024-10-30T18:36:24Z",
80
+ "repoId": 712111618,
81
+ "pullRequestNo": 327
82
+ },
83
+ {
84
+ "name": "tjbck",
85
+ "id": 25473318,
86
+ "comment_id": 2454051305,
87
+ "created_at": "2024-11-04T08:09:46Z",
88
+ "repoId": 712111618,
89
+ "pullRequestNo": 332
90
  }
91
  ]
92
  }