Spaces:
Sleeping
Sleeping
Update rag/ingest.py
Browse files- rag/ingest.py +7 -20
rag/ingest.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import io, os, re, zipfile, requests
|
| 2 |
import fitz # PyMuPDF
|
| 3 |
from tqdm import tqdm
|
| 4 |
from app.config import settings
|
|
@@ -7,9 +7,6 @@ from app.deps import add_to_index
|
|
| 7 |
EDINET_LIST = "https://api.edinet-fsa.go.jp/api/v2/documents.json"
|
| 8 |
EDINET_GET = "https://api.edinet-fsa.go.jp/api/v2/documents/{doc_id}"
|
| 9 |
|
| 10 |
-
# 公式仕様: 書類一覧APIは ?date=YYYY-MM-DD&type=2&Subscription-Key=APIキー
|
| 11 |
-
# 取得APIは /documents/{docID}?type=2(PDFなど)&Subscription-Key=APIキー。
|
| 12 |
-
|
| 13 |
def list_edinet(date: str):
|
| 14 |
params = {"date": date, "type": 2, "Subscription-Key": settings.EDINET_API_KEY}
|
| 15 |
r = requests.get(EDINET_LIST, params=params, timeout=60)
|
|
@@ -22,7 +19,6 @@ def download_edinet_pdf(doc_id: str) -> bytes:
|
|
| 22 |
url = EDINET_GET.format(doc_id=doc_id)
|
| 23 |
r = requests.get(url, params=params, timeout=120)
|
| 24 |
r.raise_for_status()
|
| 25 |
-
# 返り値はZIPのこともある。PDFが単体/複数含まれるので展開して結合
|
| 26 |
content = r.content
|
| 27 |
try:
|
| 28 |
with zipfile.ZipFile(io.BytesIO(content)) as zf:
|
|
@@ -30,20 +26,18 @@ def download_edinet_pdf(doc_id: str) -> bytes:
|
|
| 30 |
for name in zf.namelist():
|
| 31 |
if name.lower().endswith(".pdf"):
|
| 32 |
pdf_bytes += zf.read(name)
|
| 33 |
-
if not pdf_bytes:
|
| 34 |
-
# XBRLやCSVのみ等のケースはスキップ
|
| 35 |
-
return b""
|
| 36 |
return pdf_bytes
|
| 37 |
except zipfile.BadZipFile:
|
| 38 |
-
# 直接PDFが返るケース
|
| 39 |
return content
|
| 40 |
|
| 41 |
def pdf_to_text(pdf_bytes: bytes) -> str:
|
| 42 |
-
|
|
|
|
|
|
|
| 43 |
with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
|
| 44 |
for page in doc:
|
| 45 |
-
|
| 46 |
-
return "\n".join(
|
| 47 |
|
| 48 |
def chunk_text(text: str, max_chars=1000, overlap=150):
|
| 49 |
text = re.sub(r"\s+", " ", text).strip()
|
|
@@ -56,13 +50,12 @@ def chunk_text(text: str, max_chars=1000, overlap=150):
|
|
| 56 |
return chunks
|
| 57 |
|
| 58 |
def build_source_url(doc_id: str) -> str:
    """Return the in-app proxy URL for an EDINET document's PDF.

    The link targets this application's own proxy route
    (``/proxy/edinet/{doc_id}``) instead of the EDINET API, so the API key
    is not exposed in URLs handed out to clients.

    Args:
        doc_id: EDINET document identifier, inserted into the path as-is.

    Returns:
        A relative URL of the form ``/proxy/edinet/<doc_id>?type=pdf``.
    """
    return f"/proxy/edinet/{doc_id}?type=pdf"
|
| 61 |
|
| 62 |
def ingest_edinet_for_company(edinet_code: str, date: str):
|
| 63 |
recs = []
|
| 64 |
for row in list_edinet(date):
|
| 65 |
-
if row.get("edinetCode") != edinet_code:
|
| 66 |
continue
|
| 67 |
if row.get("pdfFlag") != "1":
|
| 68 |
continue
|
|
@@ -82,9 +75,3 @@ def ingest_edinet_for_company(edinet_code: str, date: str):
|
|
| 82 |
if recs:
|
| 83 |
add_to_index(recs)
|
| 84 |
return len(recs)
|
| 85 |
-
|
| 86 |
-
# --- TDnet(任意: 有料API or RSSフォールバック) ---
|
| 87 |
-
def ingest_tdnet_by_rss(sec_code: str):
    """Placeholder for TDnet ingestion; currently ingests nothing.

    The official JPX TDnet API is a paid service. Without a contract,
    retrieval is limited to an alternative source such as an external RSS
    feed. That fallback is not implemented yet and is to be added when
    needed; for the MVP, EDINET-centred ingestion is considered sufficient.

    Args:
        sec_code: Securities code of the target company. Unused for now.

    Returns:
        Always 0.
    """
    return 0
|
|
|
|
| 1 |
+
import io, os, re, zipfile, requests
|
| 2 |
import fitz # PyMuPDF
|
| 3 |
from tqdm import tqdm
|
| 4 |
from app.config import settings
|
|
|
|
| 7 |
EDINET_LIST = "https://api.edinet-fsa.go.jp/api/v2/documents.json"
|
| 8 |
EDINET_GET = "https://api.edinet-fsa.go.jp/api/v2/documents/{doc_id}"
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
def list_edinet(date: str):
|
| 11 |
params = {"date": date, "type": 2, "Subscription-Key": settings.EDINET_API_KEY}
|
| 12 |
r = requests.get(EDINET_LIST, params=params, timeout=60)
|
|
|
|
| 19 |
url = EDINET_GET.format(doc_id=doc_id)
|
| 20 |
r = requests.get(url, params=params, timeout=120)
|
| 21 |
r.raise_for_status()
|
|
|
|
| 22 |
content = r.content
|
| 23 |
try:
|
| 24 |
with zipfile.ZipFile(io.BytesIO(content)) as zf:
|
|
|
|
| 26 |
for name in zf.namelist():
|
| 27 |
if name.lower().endswith(".pdf"):
|
| 28 |
pdf_bytes += zf.read(name)
|
|
|
|
|
|
|
|
|
|
| 29 |
return pdf_bytes
|
| 30 |
except zipfile.BadZipFile:
|
|
|
|
| 31 |
return content
|
| 32 |
|
| 33 |
def pdf_to_text(pdf_bytes: bytes) -> str:
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of a PDF document. May be empty.

    Returns:
        The text of each page joined with newlines, or an empty string
        when ``pdf_bytes`` is empty.
    """
    if not pdf_bytes:
        # Nothing to parse; return early without opening a document.
        return ""
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # Build the list inside the ``with`` so every page is read while
        # the document is still open.
        texts = [page.get_text("text") for page in doc]
    return "\n".join(texts)
|
| 41 |
|
| 42 |
def chunk_text(text: str, max_chars=1000, overlap=150):
|
| 43 |
text = re.sub(r"\s+", " ", text).strip()
|
|
|
|
| 50 |
return chunks
|
| 51 |
|
| 52 |
def build_source_url(doc_id: str) -> str:
    """Return the in-app proxy URL for an EDINET document's PDF.

    The result is a relative path under this application's
    ``/proxy/edinet/`` route rather than a direct EDINET API URL.

    Args:
        doc_id: EDINET document identifier, inserted into the path as-is.

    Returns:
        A relative URL of the form ``/proxy/edinet/<doc_id>?type=pdf``.
    """
    return f"/proxy/edinet/{doc_id}?type=pdf"
|
| 54 |
|
| 55 |
def ingest_edinet_for_company(edinet_code: str, date: str):
|
| 56 |
recs = []
|
| 57 |
for row in list_edinet(date):
|
| 58 |
+
if row.get("edinetCode") != edinet_code:
|
| 59 |
continue
|
| 60 |
if row.get("pdfFlag") != "1":
|
| 61 |
continue
|
|
|
|
| 75 |
if recs:
|
| 76 |
add_to_index(recs)
|
| 77 |
return len(recs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|