Claude committed on
Commit
d007f0c
·
0 Parent(s):

init: lightweight PDF parser

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +9 -0
  3. README.md +8 -0
  4. app.py +97 -0
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.py linguist-language=Python
Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+ WORKDIR /app
3
+
4
+ RUN pip install --no-cache-dir --timeout 120 fastapi uvicorn requests pdfplumber
5
+
6
+ COPY app.py /app/
7
+
8
+ EXPOSE 7860
9
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: OpenWolf PDF Parser
3
+ emoji: 📄
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ ---
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OpenWolf PDF Parser — Lightweight HF Space
3
+ Only does PDF text extraction + chunk splitting.
4
+ No LLM, no bge-m3, no heavy ML.
5
+ """
6
+
7
+ import os, sys, json, urllib.parse, re
8
+ from pathlib import Path
9
+ from fastapi import FastAPI, Request, HTTPException
10
+ from fastapi.responses import JSONResponse
11
+
12
+ app = FastAPI(title="OpenWolf PDF Parser")
13
+
14
@app.get("/health")
async def health():
    """Liveness probe: report that the service is up."""
    payload = {"status": "ok"}
    return payload
17
+
18
@app.post("/parse")
async def parse_pdf(request: Request):
    """Extract the full text of a PDF, downloading it from GitHub if needed.

    Request body (JSON object)::

        {"file_path": "inputs/xxx.pdf", "repo": "owner/repo", "pat": "ghp_xxx"}

    ``repo`` and ``pat`` fall back to the ``GITHUB_REPO`` / ``GITHUB_PAT``
    environment variables when they are absent or empty in the body.

    Returns:
        ``{"ok": True, "text": ..., "chars": ...}`` on success, or a JSON
        response ``{"ok": False, "error": ...}`` when the file cannot be
        downloaded.

    Raises:
        HTTPException: 400 when the body is not a JSON object, when
            ``file_path`` is missing, or when ``file_path`` would resolve
            outside the ``/app`` directory.
    """
    try:
        body = await request.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        raise HTTPException(400, "request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(400, "request body must be a JSON object")

    file_path = body.get("file_path") or ""
    repo = body.get("repo") or os.environ.get("GITHUB_REPO", "")
    pat = body.get("pat") or os.environ.get("GITHUB_PAT", "")

    if not isinstance(file_path, str) or not file_path:
        raise HTTPException(400, "file_path required")

    # file_path comes from the client: an absolute path or ".." segments
    # would otherwise let the request read or write files outside /app.
    base_dir = Path("/app").resolve()
    try:
        local_path = (base_dir / file_path).resolve()
    except (OSError, ValueError) as exc:
        raise HTTPException(400, "invalid file_path") from exc
    if not local_path.is_relative_to(base_dir):
        raise HTTPException(400, "file_path must stay inside the app directory")

    if not local_path.exists():
        # Download from GitHub
        if not isinstance(repo, str) or not repo:
            return JSONResponse({"ok": False, "error": "repo required to download file"})

        import requests as req

        # Percent-encode every path segment (repo included) so characters
        # such as "?" or "#" cannot change which API endpoint is requested.
        encoded_repo = "/".join(urllib.parse.quote(s, safe="") for s in repo.split("/"))
        encoded = "/".join(urllib.parse.quote(s, safe="") for s in file_path.split("/"))
        url = f"https://api.github.com/repos/{encoded_repo}/contents/{encoded}"

        headers = {"Accept": "application/vnd.github.raw"}
        if pat:
            headers["Authorization"] = f"token {pat}"

        # NOTE: requests is synchronous, so this call blocks the event loop
        # for the duration of the download (up to the 120 s timeout).
        try:
            # The headers must be sent: without them the token is ignored and
            # the API answers with JSON metadata instead of the raw bytes.
            resp = req.get(url, headers=headers, timeout=120)
        except req.RequestException as exc:
            return JSONResponse({"ok": False, "error": f"download failed: {exc}"})
        if resp.status_code != 200:
            return JSONResponse({"ok": False, "error": f"download failed HTTP {resp.status_code}"})

        local_path.parent.mkdir(parents=True, exist_ok=True)
        local_path.write_bytes(resp.content)

    # Extract text
    import pdfplumber

    with pdfplumber.open(local_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages without extractable text are skipped; each kept page ends in "\n".
    text = "".join(f"{t}\n" for t in page_texts if t)

    return {"ok": True, "text": text, "chars": len(text)}
55
+
56
def _split_chunks(text: str, chunk_size: int) -> list[str]:
    """Split *text* into stripped chunks of at most *chunk_size* characters.

    Each window is cut at the last blank line ("\\n\\n") found inside it; if
    there is none, the window is cut at its hard limit. Chunks that are empty
    after stripping are dropped. *chunk_size* must be >= 1, otherwise the
    cursor would never advance.
    """
    chunks = []
    pos = 0
    total = len(text)
    while pos < total:
        limit = min(pos + chunk_size, total)
        cut = text.rfind("\n\n", pos, limit)
        # rfind returns -1 when no break exists; a break exactly at ``pos``
        # would produce an empty chunk, so both cases use the hard limit.
        if cut <= pos:
            cut = limit
        piece = text[pos:cut].strip()
        if piece:
            chunks.append(piece)
        pos = cut
    return chunks


@app.post("/chunks")
async def get_chunks(request: Request):
    """Build a stable list of chunks from raw text or from a local file.

    Request body (JSON object), either of::

        {"file_path": "...", "chunk_size": 4500}
        {"text": "...", "chunk_size": 4500}

    ``text`` takes precedence. Otherwise ``file_path`` is read from under
    ``/app``: files whose name ends in ``.pdf`` go through pdfplumber, any
    other file is read as UTF-8 text with undecodable bytes ignored.

    Returns:
        ``{"ok": True, "chunks": [...], "total": ..., "total_chars": ...}``,
        or a JSON response ``{"ok": False, "error": "no text provided"}``
        when no text could be obtained.

    Raises:
        HTTPException: 400 when the body is not a JSON object, when
            ``chunk_size`` is not a positive integer, when ``text`` is not a
            string, or when ``file_path`` would resolve outside ``/app``.
    """
    try:
        body = await request.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        raise HTTPException(400, "request body must be valid JSON") from exc
    if not isinstance(body, dict):
        raise HTTPException(400, "request body must be a JSON object")

    try:
        chunk_size = int(body.get("chunk_size", 4500))
    except (TypeError, ValueError, OverflowError) as exc:
        raise HTTPException(400, "chunk_size must be an integer") from exc
    # A zero or negative size would keep the splitting loop from advancing.
    if chunk_size <= 0:
        raise HTTPException(400, "chunk_size must be a positive integer")

    text = body.get("text") or ""
    if not isinstance(text, str):
        raise HTTPException(400, "text must be a string")

    if not text:
        file_path = body.get("file_path") or ""
        if file_path:
            if not isinstance(file_path, str):
                raise HTTPException(400, "file_path must be a string")
            # file_path comes from the client: refuse anything that would
            # resolve outside /app (absolute paths, ".." segments).
            base_dir = Path("/app").resolve()
            try:
                local_path = (base_dir / file_path).resolve()
            except (OSError, ValueError) as exc:
                raise HTTPException(400, "invalid file_path") from exc
            if not local_path.is_relative_to(base_dir):
                raise HTTPException(400, "file_path must stay inside the app directory")

            # Read local file
            if local_path.is_file():
                ext = file_path.rsplit(".", 1)[-1].lower()
                if ext == "pdf":
                    import pdfplumber

                    with pdfplumber.open(local_path) as pdf:
                        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
                else:
                    text = local_path.read_text(encoding="utf-8", errors="ignore")

    if not text:
        return JSONResponse({"ok": False, "error": "no text provided"})

    chunks = _split_chunks(text, chunk_size)
    return {"ok": True, "chunks": chunks, "total": len(chunks), "total_chars": len(text)}