File size: 3,602 Bytes
6a80b48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from starlette.applications import Starlette
from starlette.responses import JSONResponse, PlainTextResponse
from starlette.routing import Route
from starlette.requests import Request
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware
import tempfile
import shutil
import os

import pymupdf4llm
from unstructured.partition.auto import partition
from unstructured.cleaners.core import clean
from chonkie import RecursiveChunker, RecursiveRules

# Recipe document describing how Korean plaintext is split.
# Only the "recipe" -> "recursive_rules" part is consumed below; the other
# keys are descriptive metadata.
_RECIPE_SPEC = {
    "name": "default",
    "schema": "v1",
    "description": "Default recipe for plaintext documents in Korean",
    "language": "kr",
    "metadata": {
        "version": "0.1.0",
        "author": "Chonkie Team",
    },
    "recipe": {
        # The full-width CJK punctuation must be the real characters;
        # mis-encoded byte sequences would never match document text.
        "delimiters": [".", "。", "!", "！", "?", "？", "\n"],
        "include_delim": "prev",
        "recursive_rules": {
            # Levels are listed from the coarsest split to the finest.
            "levels": [
                # Paragraph breaks.
                {
                    "delimiters": ["\n\n", "\n\r"],
                    "whitespace": False,
                    "include_delim": "next",
                },
                # Single line breaks.
                {
                    "delimiters": ["\n", "\r"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Sentence-ending punctuation (ASCII and full-width).
                {
                    "delimiters": [".", "。", "!", "！", "?", "？"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Split on whitespace.
                {
                    "delimiters": None,
                    "whitespace": True,
                    "include_delim": "prev",
                },
                # Neither delimiters nor whitespace: presumably chonkie's
                # finest-grained fallback -- confirm against the chonkie docs.
                {
                    "delimiters": None,
                    "whitespace": False,
                    "include_delim": "prev",
                },
            ],
        },
    },
}

# RecursiveRules.from_dict expects the mapping that holds "levels" at its top
# level, not the whole recipe document, so hand it the nested section.
# A shallow copy is passed so _RECIPE_SPEC stays intact however from_dict
# treats its argument.
recipe = RecursiveRules.from_dict(dict(_RECIPE_SPEC["recipe"]["recursive_rules"]))
chunker = RecursiveChunker(rules=recipe)

# Document-processing endpoint
def _extract_text(path: str, ext: str) -> str:
    """Return the text of the document stored at *path*.

    Args:
        path: Filesystem path of the document.
        ext: Lower-cased file extension including the leading dot.

    Returns:
        Markdown produced by ``pymupdf4llm`` when *ext* is ``".pdf"``;
        otherwise the newline-joined ``unstructured`` elements after cleaning.
    """
    if ext == ".pdf":
        return pymupdf4llm.to_markdown(path)

    elements = partition(path)
    return clean(
        "\n".join(str(el) for el in elements),
        dashes=True,
        trailing_punctuation=True,
    )


async def handle_file_upload(request: Request) -> JSONResponse:
    """Chunk an uploaded document and return the chunk texts as JSON.

    Reads the ``file`` field of the submitted form, spools it to a temporary
    file that keeps the original extension, extracts its text and splits it
    with the module-level ``chunker``.

    Args:
        request: Incoming request carrying the form data.

    Returns:
        A JSON array of chunk strings on success, a 400 JSON error when no
        file was supplied, or a 500 JSON error when saving or processing
        the document fails.
    """
    form = await request.form()
    upload = form.get("file")

    # A plain (non-file) form field arrives as ``str`` and has no
    # ``filename`` attribute; treat it the same as a missing upload.
    filename = getattr(upload, "filename", None)
    if not filename:
        return JSONResponse({"error": "파일을 업로드해주세요."}, status_code=400)

    ext = os.path.splitext(filename)[1].lower()

    tmp_path = None
    try:
        # The copy happens inside the try block so that a failure while
        # spooling still reaches the cleanup in ``finally``.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(upload.file, tmp)

        chunks = chunker(_extract_text(tmp_path, ext))
        return JSONResponse([chunk.text for chunk in chunks])
    except Exception as e:
        return JSONResponse({"error": f"문서 처리 실패: {str(e)}"}, status_code=500)
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not replace the
                # response that is already being returned.
                pass

# Routing configuration
# The only endpoint: POST /upload, served by handle_file_upload.
routes = [Route("/upload", handle_file_upload, methods=["POST"])]

# Allow CORS (optional): every origin, method and header is accepted.
# NOTE(review): wildcard CORS is wide open -- confirm this is intended
# for the deployment.
middleware = [
    Middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    ),
]

# NOTE(review): debug=True is presumably meant for development only; check
# the Starlette docs for what it exposes on unhandled errors before deploying.
app = Starlette(
    debug=True,
    routes=routes,
    middleware=middleware,
)