Spaces:

hugh007
/

openwolf-pdf-parser

Sleeping

App Files Files Community

Claude committed 26 days ago

Commit

d007f0c

0 Parent(s):

init: lightweight PDF parser

Browse files

Files changed (4) hide show

.gitattributes +1 -0
Dockerfile +9 -0
README.md +8 -0
app.py +97 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.py linguist-language=Python

Dockerfile ADDED Viewed

	@@ -0,0 +1,9 @@

+# Image for the PDF parser service: Python 3.12 slim base plus the packages app.py imports.
+FROM python:3.12-slim
+WORKDIR /app
+# Dependencies are installed unpinned; --no-cache-dir keeps pip's download cache out of the image layer.
+RUN pip install --no-cache-dir --timeout 120 fastapi uvicorn requests pdfplumber
+# app.py is the only source file copied into the image.
+COPY app.py /app/
+EXPOSE 7860
+# Serve the FastAPI object "app" from module "app" on all interfaces, on the port exposed above.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,8 @@

+---
+title: OpenWolf PDF Parser
+emoji: 📄
+colorFrom: blue
+colorTo: green
+sdk: docker
+pinned: false
+---

app.py ADDED Viewed

	@@ -0,0 +1,97 @@

+"""
+OpenWolf PDF Parser — Lightweight HF Space
+Only does PDF text extraction + chunk splitting.
+No LLM, no bge-m3, no heavy ML.
+"""
+import os, sys, json, urllib.parse, re
+from pathlib import Path
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.responses import JSONResponse
+# ASGI application instance; the route decorators below register their handlers on it.
+app = FastAPI(title="OpenWolf PDF Parser")
+@app.get("/health")
+async def health():
+    return {"status": "ok"}
+@app.post("/parse")
+async def parse_pdf(request: Request):
+    """
+    解析 PDF，返回全文文本
+    Body: { file_path: "inputs/xxx.pdf", repo: "owner/repo", pat: "ghp_xxx" }
+    """
+    body = await request.json()
+    file_path = body.get("file_path", "")
+    repo = body.get("repo", os.environ.get("GITHUB_REPO", ""))
+    pat = body.get("pat", os.environ.get("GITHUB_PAT", ""))
+    if not file_path:
+        raise HTTPException(400, "file_path required")
+    local_path = Path("/app") / file_path
+    if not local_path.exists():
+        # Download from GitHub
+        import requests as req
+        encoded = '/'.join(urllib.parse.quote(s, safe='') for s in file_path.split('/'))
+        url = f"https://api.github.com/repos/{repo}/contents/{encoded}"
+        headers = {"Authorization": f"token {pat}", "Accept": "application/vnd.github.raw"} if pat else {"Accept": "application/vnd.github.raw"}
+        resp = req.get(url, timeout=120)
+        if resp.status_code != 200:
+            return JSONResponse({"ok": False, "error": f"download failed HTTP {resp.status_code}"})
+        local_path.parent.mkdir(parents=True, exist_ok=True)
+        local_path.write_bytes(resp.content)
+    # Extract text
+    import pdfplumber
+    text = ""
+    with pdfplumber.open(local_path) as pdf:
+        for page in pdf.pages:
+            t = page.extract_text()
+            if t:
+                text += t + "\n"
+    return {"ok": True, "text": text, "chars": len(text)}
+@app.post("/chunks")
+async def get_chunks(request: Request):
+    """
+    从文本或 PDF 生成稳定 chunk 列表
+    Body: { file_path: "...", chunk_size: 4500 }
+    或:   { text: "...", chunk_size: 4500 }
+    """
+    body = await request.json()
+    chunk_size = int(body.get("chunk_size", 4500))
+    text = body.get("text", "")
+    if not text:
+        file_path = body.get("file_path", "")
+        if file_path:
+            # Read local file
+            local_path = Path("/app") / file_path
+            if local_path.exists():
+                ext = file_path.rsplit(".", 1)[-1].lower()
+                if ext == "pdf":
+                    import pdfplumber
+                    with pdfplumber.open(local_path) as pdf:
+                        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
+                else:
+                    text = local_path.read_text(encoding="utf-8", errors="ignore")
+    if not text:
+        return JSONResponse({"ok": False, "error": "no text provided"})
+    chunks = []
+    i = 0
+    n = len(text)
+    while i < n:
+        end = min(i + chunk_size, n)
+        cut = text.rfind("\n\n", i, end)
+        if cut == -1 or cut <= i:
+            cut = end
+        chunk = text[i:cut].strip()
+        if chunk:
+            chunks.append(chunk)
+        i = cut if cut > i else end
+    return {"ok": True, "chunks": chunks, "total": len(chunks), "total_chars": n}