Spaces:

Corin1998
/

IR_PR_PilotPro

Sleeping

App Files Community

Corin1998 committed on Sep 17, 2025

Commit

0881450

verified ·

1 Parent(s): 91dc69f

Update rag/ingest.py

Browse files

Files changed (1) hide show

rag/ingest.py +7 -20

rag/ingest.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import io, os, re, zipfile, requests, datetime
 import fitz  # PyMuPDF
 from tqdm import tqdm
 from app.config import settings
@@ -7,9 +7,6 @@ from app.deps import add_to_index
 EDINET_LIST = "https://api.edinet-fsa.go.jp/api/v2/documents.json"
 EDINET_GET  = "https://api.edinet-fsa.go.jp/api/v2/documents/{doc_id}"
-# 公式仕様: 書類一覧APIは ?date=YYYY-MM-DD&type=2&Subscription-Key=APIキー
-# 取得APIは /documents/{docID}?type=2(PDFなど)&Subscription-Key=APIキー。:contentReference[oaicite:4]{index=4}
 def list_edinet(date: str):
     params = {"date": date, "type": 2, "Subscription-Key": settings.EDINET_API_KEY}
     r = requests.get(EDINET_LIST, params=params, timeout=60)
@@ -22,7 +19,6 @@ def download_edinet_pdf(doc_id: str) -> bytes:
     url = EDINET_GET.format(doc_id=doc_id)
     r = requests.get(url, params=params, timeout=120)
     r.raise_for_status()
-    # 返り値はZIPのこともある。PDFが単体/複数含まれるので展開して結合
     content = r.content
     try:
         with zipfile.ZipFile(io.BytesIO(content)) as zf:
@@ -30,20 +26,18 @@ def download_edinet_pdf(doc_id: str) -> bytes:
             for name in zf.namelist():
                 if name.lower().endswith(".pdf"):
                     pdf_bytes += zf.read(name)
-            if not pdf_bytes:
-                # XBRLやCSVのみ等のケースはスキップ
-                return b""
             return pdf_bytes
     except zipfile.BadZipFile:
-        # 直接PDFが返るケース
         return content
 def pdf_to_text(pdf_bytes: bytes) -> str:
-    text_all = []
     with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
         for page in doc:
-            text_all.append(page.get_text("text"))
-    return "\n".join(text_all)
 def chunk_text(text: str, max_chars=1000, overlap=150):
     text = re.sub(r"\s+", " ", text).strip()
@@ -56,13 +50,12 @@ def chunk_text(text: str, max_chars=1000, overlap=150):
     return chunks
 def build_source_url(doc_id: str) -> str:
-    # APIキー露出を避けるため、自前プロキシURLにする（/proxy/edinet/{doc_id}）
     return f"/proxy/edinet/{doc_id}?type=pdf"
 def ingest_edinet_for_company(edinet_code: str, date: str):
     recs = []
     for row in list_edinet(date):
-        if row.get("edinetCode") != edinet_code:
             continue
         if row.get("pdfFlag") != "1":
             continue
@@ -82,9 +75,3 @@ def ingest_edinet_for_company(edinet_code: str, date: str):
     if recs:
         add_to_index(recs)
     return len(recs)
-# --- TDnet（任意: 有料API or RSSフォールバック） ---
-def ingest_tdnet_by_rss(sec_code: str):
-    # JPX公式TDnet APIは有料。契約がない場合は外部RSS等で代替取得に留める。:contentReference[oaicite:5]{index=5}
-    # 実装は必要に応じて追加（MVPではEDINET中心で十分）
-    return 0

+import io, os, re, zipfile, requests
 import fitz  # PyMuPDF
 from tqdm import tqdm
 from app.config import settings
 EDINET_LIST = "https://api.edinet-fsa.go.jp/api/v2/documents.json"
 EDINET_GET  = "https://api.edinet-fsa.go.jp/api/v2/documents/{doc_id}"
 def list_edinet(date: str):
     params = {"date": date, "type": 2, "Subscription-Key": settings.EDINET_API_KEY}
     r = requests.get(EDINET_LIST, params=params, timeout=60)
     url = EDINET_GET.format(doc_id=doc_id)
     r = requests.get(url, params=params, timeout=120)
     r.raise_for_status()
     content = r.content
     try:
         with zipfile.ZipFile(io.BytesIO(content)) as zf:
             for name in zf.namelist():
                 if name.lower().endswith(".pdf"):
                     pdf_bytes += zf.read(name)
             return pdf_bytes
     except zipfile.BadZipFile:
         return content
 def pdf_to_text(pdf_bytes: bytes) -> str:
+    if not pdf_bytes:
+        return ""
+    texts = []
     with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
         for page in doc:
+            texts.append(page.get_text("text"))
+    return "\n".join(texts)
 def chunk_text(text: str, max_chars=1000, overlap=150):
     text = re.sub(r"\s+", " ", text).strip()
     return chunks
 def build_source_url(doc_id: str) -> str:
     return f"/proxy/edinet/{doc_id}?type=pdf"
 def ingest_edinet_for_company(edinet_code: str, date: str):
     recs = []
     for row in list_edinet(date):
+        if row.get("edinetCode") != edinet_code:
             continue
         if row.get("pdfFlag") != "1":
             continue
     if recs:
         add_to_index(recs)
     return len(recs)