| | from starlette.applications import Starlette |
| | from starlette.responses import JSONResponse, PlainTextResponse |
| | from starlette.routing import Route |
| | from starlette.requests import Request |
| | from starlette.middleware import Middleware |
| | from starlette.middleware.cors import CORSMiddleware |
| | import tempfile |
| | import shutil |
| | import os |
| |
|
| | import pymupdf4llm |
| | from unstructured.partition.auto import partition |
| | from unstructured.cleaners.core import clean |
| | from chonkie import RecursiveChunker, RecursiveRules |
| |
|
# Chunking recipe for Korean plaintext, laid out like a Chonkie hub recipe
# document. Only the "recursive_rules" subtree is used to build the rules
# below; the remaining keys are descriptive metadata kept for reference.
#
# The full-width delimiters ("。", "！", "？") are written as real characters:
# the previous literals were mis-decoded bytes, which made the full-width
# "!" and "?" indistinguishable and prevented them from ever matching.
_RECIPE_SPEC = {
    "name": "default",
    "schema": "v1",
    "description": "Default recipe for plaintext documents in Korean",
    "language": "kr",
    "metadata": {
        "version": "0.1.0",
        "author": "Chonkie Team",
    },
    "recipe": {
        "delimiters": [".", "。", "!", "！", "?", "？", "\n"],
        "include_delim": "prev",
        "recursive_rules": {
            "levels": [
                # Level 1: paragraph breaks.
                {
                    "delimiters": ["\n\n", "\n\r"],
                    "whitespace": False,
                    "include_delim": "next",
                },
                # Level 2: single line breaks.
                {
                    "delimiters": ["\n", "\r"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Level 3: sentence-ending punctuation (ASCII and full-width).
                {
                    "delimiters": [".", "。", "!", "！", "?", "？"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Level 4: split on whitespace.
                {
                    "delimiters": None,
                    "whitespace": True,
                    "include_delim": "prev",
                },
                # Level 5: no delimiters and no whitespace splitting.
                {
                    "delimiters": None,
                    "whitespace": False,
                    "include_delim": "prev",
                },
            ],
        },
    },
}

# from_dict() is given the mapping that holds "levels" rather than the whole
# recipe document. A shallow copy is passed so _RECIPE_SPEC stays intact even
# if the callee consumes keys from its argument.
# NOTE(review): confirm against the installed chonkie version that
# RecursiveRules.from_dict expects {"levels": [...]}.
recipe = RecursiveRules.from_dict(dict(_RECIPE_SPEC["recipe"]["recursive_rules"]))

# Shared chunker instance used by the upload handler.
chunker = RecursiveChunker(rules=recipe)
| |
|
| | |
def _extract_text(path, ext):
    """Return the text of the document at *path*, selected by extension.

    PDFs are converted to Markdown with pymupdf4llm; every other file type
    is partitioned with unstructured, joined line by line and cleaned.
    """
    if ext == ".pdf":
        return pymupdf4llm.to_markdown(path)

    elements = partition(path)
    return clean(
        "\n".join(str(el) for el in elements),
        dashes=True,
        trailing_punctuation=True,
    )


async def handle_file_upload(request: Request):
    """Chunk an uploaded document and return the chunk texts as JSON.

    Reads the multipart form field ``file``, spools it to a temporary file
    (the extractors work on paths), extracts its text and runs it through
    the module-level ``chunker``.

    Returns:
        200 with a JSON list of chunk strings on success;
        400 with ``{"error": ...}`` when no file was uploaded;
        500 with ``{"error": ...}`` when saving or processing fails.
    """
    form = await request.form()
    upload = form.get("file")

    # A plain text form field arrives as ``str`` and has no ``filename``;
    # treat it (and a missing field) as "no file uploaded" instead of
    # failing with AttributeError.
    filename = getattr(upload, "filename", None)
    if not filename:
        return JSONResponse({"error": "파일을 업로드해주세요."}, status_code=400)

    # Keep the original extension on the temp file so partition() can detect
    # the file type from the path.
    ext = os.path.splitext(filename)[1].lower()

    tmp_path = None
    try:
        # The copy lives inside the try so that a failed write still reaches
        # the cleanup below (delete=False means nobody else removes the file).
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(upload.file, tmp)

        chunks = chunker(_extract_text(tmp_path, ext))
        return JSONResponse([chunk.text for chunk in chunks])
    except Exception as e:
        # Top-level boundary: report any failure to the client as JSON.
        return JSONResponse({"error": f"문서 처리 실패: {str(e)}"}, status_code=500)
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)
| |
|
| | |
# URL table: a single POST endpoint that receives the document upload.
routes = [
    Route(path="/upload", endpoint=handle_file_upload, methods=["POST"]),
]

# CORS is fully open: any origin, method and header is accepted.
# NOTE(review): confirm this is intended outside local development.
middleware = [
    Middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    ),
]

# ASGI application object served by the ASGI server.
# NOTE(review): debug=True is enabled here; confirm it is switched off for
# production deployments.
app = Starlette(
    routes=routes,
    middleware=middleware,
    debug=True,
)