Doramong committed on
Commit
6a80b48
·
verified ·
1 Parent(s): 6f23384

Upload main.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. main.py +122 -0
main.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from starlette.applications import Starlette
2
+ from starlette.responses import JSONResponse, PlainTextResponse
3
+ from starlette.routing import Route
4
+ from starlette.requests import Request
5
+ from starlette.middleware import Middleware
6
+ from starlette.middleware.cors import CORSMiddleware
7
+ import tempfile
8
+ import shutil
9
+ import os
10
+
11
+ import pymupdf4llm
12
+ from unstructured.partition.auto import partition
13
+ from unstructured.cleaners.core import clean
14
+ from chonkie import RecursiveChunker, RecursiveRules
15
+
16
# Chunking recipe for Korean plain-text documents, laid out like a chonkie
# hub recipe document. Only the "recursive_rules" section is consumed below;
# the remaining keys are descriptive metadata.
# NOTE(review): "kr" is kept as written; the ISO 639-1 code for Korean is
# "ko" -- confirm which one is intended if this metadata is ever consumed.
_RECIPE_SPEC = {
    "name": "default",
    "schema": "v1",
    "description": "Default recipe for plaintext documents in Korean",
    "language": "kr",
    "metadata": {
        "version": "0.1.0",
        "author": "Chonkie Team",
    },
    "recipe": {
        # ASCII sentence enders plus their full-width CJK counterparts.
        "delimiters": [".", "。", "!", "!", "?", "?", "\n"],
        "include_delim": "prev",
        "recursive_rules": {
            "levels": [
                # Level 1: paragraph breaks.
                # NOTE(review): "\n\r" is kept as written; "\r\n" may have
                # been intended -- confirm.
                {
                    "delimiters": ["\n\n", "\n\r"],
                    "whitespace": False,
                    "include_delim": "next",
                },
                # Level 2: single line breaks.
                {
                    "delimiters": ["\n", "\r"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Level 3: sentence-ending punctuation (ASCII and full-width).
                {
                    "delimiters": [".", "。", "!", "!", "?", "?"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Level 4: split on whitespace.
                {
                    "delimiters": None,
                    "whitespace": True,
                    "include_delim": "prev",
                },
                # Level 5: no delimiters and no whitespace splitting.
                {
                    "delimiters": None,
                    "whitespace": False,
                    "include_delim": "prev",
                },
            ],
        },
    },
}

# ``RecursiveRules.from_dict`` takes the mapping that holds the "levels" key,
# not the whole recipe document, so hand it the "recursive_rules" section.
# A shallow copy is passed so the spec above is left intact if the
# constructor consumes keys from its argument.
recipe = RecursiveRules.from_dict(dict(_RECIPE_SPEC["recipe"]["recursive_rules"]))
chunker = RecursiveChunker(rules=recipe)
74
+
75
+ # Document processing handler
76
def _extract_text(path: str, ext: str) -> str:
    """Return the text of the document at *path*, chosen by its extension."""
    if ext == ".pdf":
        # PDFs are converted to Markdown.
        return pymupdf4llm.to_markdown(path)

    # Every other type is partitioned into elements, joined and cleaned.
    elements = partition(path)
    return clean(
        "\n".join(str(el) for el in elements),
        dashes=True,
        trailing_punctuation=True,
    )


async def handle_file_upload(request: Request):
    """Chunk an uploaded document and return the chunk texts as JSON.

    Reads the ``file`` field of the submitted form, copies it to a temporary
    file that keeps the original extension, extracts its text and runs the
    module-level ``chunker`` over it.

    Args:
        request: Incoming request carrying a form with a ``file`` field.

    Returns:
        A ``JSONResponse`` holding the list of chunk texts on success, a 400
        response with an ``error`` message when no file was supplied, or a
        500 response with an ``error`` message when processing fails.
    """
    form = await request.form()
    upload = form.get("file")

    # A plain text field arrives as ``str`` and has no ``filename``; treat it
    # like a missing upload instead of failing on the attribute access.
    filename = getattr(upload, "filename", None)
    if not filename:
        return JSONResponse({"error": "파일을 업로드해주세요."}, status_code=400)

    ext = os.path.splitext(filename)[1].lower()

    tmp_path = None
    try:
        # The temporary file is created inside the ``try`` so that it is
        # removed even when copying the upload fails part-way.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(upload.file, tmp)

        chunks = chunker(_extract_text(tmp_path, ext))
        return JSONResponse([chunk.text for chunk in chunks])
    except Exception as e:
        # Top-level boundary: report any processing failure to the client.
        return JSONResponse({"error": f"문서 처리 실패: {e}"}, status_code=500)
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)
+
112
+ # Routing configuration
113
routes = [
    Route("/upload", handle_file_upload, methods=["POST"]),
]

# Allow CORS (optional). Every origin, method and header is accepted.
middleware = [
    Middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    ),
]

# Debug mode makes Starlette return tracebacks on errors, so it is opt-in
# through the DEBUG environment variable instead of being always enabled.
_DEBUG = os.environ.get("DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}

app = Starlette(debug=_DEBUG, routes=routes, middleware=middleware)