"""
OpenWolf PDF Parser — Lightweight HF Space
Only does PDF text extraction + chunk splitting.
No LLM, no bge-m3, no heavy ML.
"""
import os, sys, json, urllib.parse, re
from pathlib import Path
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse
# FastAPI application instance; the @app.get/@app.post decorators below
# register their route handlers on it.
app = FastAPI(title="OpenWolf PDF Parser")
@app.get("/health")
async def health():
    """Health-check endpoint: always answers with a fixed "ok" status."""
    status_payload = {"status": "ok"}
    return status_payload
@app.post("/parse")
async def parse_pdf(request: Request):
    """Parse a PDF and return its full text, downloading it from GitHub if needed.

    Request body (JSON object):
        file_path: PDF path relative to /app, e.g. "inputs/xxx.pdf". The same
            relative path is requested from the repository when the file is
            not present locally.
        repo: "owner/repo"; defaults to the GITHUB_REPO environment variable.
        pat: GitHub token; defaults to the GITHUB_PAT environment variable.

    Returns:
        {"ok": True, "text": ..., "chars": ...} on success, or a JSON response
        {"ok": False, "error": ...} when the file cannot be downloaded.

    Raises:
        HTTPException: status 400 when the body is not a JSON object, when
            file_path is missing or not a string, or when it resolves to a
            location outside /app.
    """
    body = await request.json()
    if not isinstance(body, dict):
        raise HTTPException(400, "JSON object body required")
    file_path = body.get("file_path", "")
    repo = body.get("repo", os.environ.get("GITHUB_REPO", ""))
    pat = body.get("pat", os.environ.get("GITHUB_PAT", ""))
    if not file_path or not isinstance(file_path, str):
        raise HTTPException(400, "file_path required")

    # file_path is client-supplied. Joining an absolute path replaces the
    # "/app" prefix entirely and ".." segments climb out of it, so resolve the
    # result and insist that it is still located below /app.
    root = Path("/app").resolve()
    local_path = (root / file_path).resolve()
    if root not in local_path.parents:
        raise HTTPException(400, "invalid file_path")

    if not local_path.exists():
        # Download from GitHub
        if not repo:
            return JSONResponse({"ok": False, "error": "repo required"})
        import requests as req

        # Build the URL from the normalized relative path so that no ".."
        # segment can reach the GitHub API URL.
        remote_path = local_path.relative_to(root).as_posix()
        encoded = "/".join(
            urllib.parse.quote(segment, safe="") for segment in remote_path.split("/")
        )
        url = f"https://api.github.com/repos/{repo}/contents/{encoded}"
        headers = {"Accept": "application/vnd.github.raw"}
        if pat:
            headers["Authorization"] = f"token {pat}"
        try:
            resp = req.get(url, headers=headers, timeout=120)
        except req.RequestException as exc:
            # Report network failures through the same {"ok": False} contract
            # as a non-200 answer instead of surfacing a 500.
            return JSONResponse(
                {"ok": False, "error": f"download failed: {type(exc).__name__}"}
            )
        if resp.status_code != 200:
            return JSONResponse({"ok": False, "error": f"download failed HTTP {resp.status_code}"})
        local_path.parent.mkdir(parents=True, exist_ok=True)
        local_path.write_bytes(resp.content)

    # Extract text
    import pdfplumber

    with pdfplumber.open(local_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages without extractable text are skipped; every other page is
    # terminated by a newline.
    text = "".join(page_text + "\n" for page_text in page_texts if page_text)
    return {"ok": True, "text": text, "chars": len(text)}
@app.post("/chunks")
async def get_chunks(request: Request):
    """Build a stable list of chunks from raw text or from a local file.

    Request body (JSON object), either form:
        {"file_path": "...", "chunk_size": 4500}
        {"text": "...", "chunk_size": 4500}

    When "text" is empty, "file_path" is read relative to /app: files whose
    extension is "pdf" go through pdfplumber, anything else is read as UTF-8
    text with undecodable bytes ignored.

    Each chunk covers at most chunk_size characters. A window is cut at the
    last blank line ("\\n\\n") found after its first character, otherwise at
    the window end. Chunks are stripped and empty ones are dropped.

    Returns:
        {"ok": True, "chunks": [...], "total": ..., "total_chars": ...} on
        success, or a JSON response {"ok": False, "error": ...} when the input
        is invalid or no text could be obtained.
    """
    body = await request.json()
    if not isinstance(body, dict):
        return JSONResponse({"ok": False, "error": "JSON object body required"})

    # A chunk_size of zero or less would keep the loop below from ever
    # advancing, so reject it (and non-numeric values) up front.
    try:
        chunk_size = int(body.get("chunk_size", 4500))
    except (TypeError, ValueError, OverflowError):
        chunk_size = 0
    if chunk_size < 1:
        return JSONResponse({"ok": False, "error": "chunk_size must be a positive integer"})

    text = body.get("text", "")
    if text and not isinstance(text, str):
        return JSONResponse({"ok": False, "error": "text must be a string"})

    if not text:
        file_path = body.get("file_path", "")
        if file_path:
            if not isinstance(file_path, str):
                return JSONResponse({"ok": False, "error": "invalid file_path"})
            # Read local file. file_path is client-supplied, so refuse
            # anything that resolves outside /app (absolute paths, "..").
            root = Path("/app").resolve()
            local_path = (root / file_path).resolve()
            if root not in local_path.parents:
                return JSONResponse({"ok": False, "error": "invalid file_path"})
            if local_path.is_file():
                ext = file_path.rsplit(".", 1)[-1].lower()
                if ext == "pdf":
                    import pdfplumber

                    with pdfplumber.open(local_path) as pdf:
                        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
                else:
                    text = local_path.read_text(encoding="utf-8", errors="ignore")
    if not text:
        return JSONResponse({"ok": False, "error": "no text provided"})

    chunks = []
    start = 0
    total_chars = len(text)
    while start < total_chars:
        window_end = min(start + chunk_size, total_chars)
        # Prefer the last paragraph break inside the window. A miss (-1) or a
        # break sitting exactly at the window start falls back to a hard cut.
        cut = text.rfind("\n\n", start, window_end)
        if cut <= start:
            cut = window_end
        piece = text[start:cut].strip()
        if piece:
            chunks.append(piece)
        # cut > start always holds here because chunk_size >= 1.
        start = cut
    return {"ok": True, "chunks": chunks, "total": len(chunks), "total_chars": total_chars}
|