Corin1998 committed on
Commit
0881450
·
verified ·
1 Parent(s): 91dc69f

Update rag/ingest.py

Browse files
Files changed (1) hide show
  1. rag/ingest.py +7 -20
rag/ingest.py CHANGED
@@ -1,4 +1,4 @@
1
- import io, os, re, zipfile, requests, datetime
2
  import fitz # PyMuPDF
3
  from tqdm import tqdm
4
  from app.config import settings
@@ -7,9 +7,6 @@ from app.deps import add_to_index
7
  EDINET_LIST = "https://api.edinet-fsa.go.jp/api/v2/documents.json"
8
  EDINET_GET = "https://api.edinet-fsa.go.jp/api/v2/documents/{doc_id}"
9
 
10
- # 公式仕様: 書類一覧APIは ?date=YYYY-MM-DD&type=2&Subscription-Key=APIキー
11
- # 取得APIは /documents/{docID}?type=2(PDFなど)&Subscription-Key=APIキー。:contentReference[oaicite:4]{index=4}
12
-
13
  def list_edinet(date: str):
14
  params = {"date": date, "type": 2, "Subscription-Key": settings.EDINET_API_KEY}
15
  r = requests.get(EDINET_LIST, params=params, timeout=60)
@@ -22,7 +19,6 @@ def download_edinet_pdf(doc_id: str) -> bytes:
22
  url = EDINET_GET.format(doc_id=doc_id)
23
  r = requests.get(url, params=params, timeout=120)
24
  r.raise_for_status()
25
- # 返り値はZIPのこともある。PDFが単体/複数含まれるので展開して結合
26
  content = r.content
27
  try:
28
  with zipfile.ZipFile(io.BytesIO(content)) as zf:
@@ -30,20 +26,18 @@ def download_edinet_pdf(doc_id: str) -> bytes:
30
  for name in zf.namelist():
31
  if name.lower().endswith(".pdf"):
32
  pdf_bytes += zf.read(name)
33
- if not pdf_bytes:
34
- # XBRLやCSVのみ等のケースはスキップ
35
- return b""
36
  return pdf_bytes
37
  except zipfile.BadZipFile:
38
- # 直接PDFが返るケース
39
  return content
40
 
41
  def pdf_to_text(pdf_bytes: bytes) -> str:
42
- text_all = []
 
 
43
  with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
44
  for page in doc:
45
- text_all.append(page.get_text("text"))
46
- return "\n".join(text_all)
47
 
48
  def chunk_text(text: str, max_chars=1000, overlap=150):
49
  text = re.sub(r"\s+", " ", text).strip()
@@ -56,13 +50,12 @@ def chunk_text(text: str, max_chars=1000, overlap=150):
56
  return chunks
57
 
58
  def build_source_url(doc_id: str) -> str:
59
- # APIキー露出を避けるため、自前プロキシURLにする(/proxy/edinet/{doc_id})
60
  return f"/proxy/edinet/{doc_id}?type=pdf"
61
 
62
  def ingest_edinet_for_company(edinet_code: str, date: str):
63
  recs = []
64
  for row in list_edinet(date):
65
- if row.get("edinetCode") != edinet_code:
66
  continue
67
  if row.get("pdfFlag") != "1":
68
  continue
@@ -82,9 +75,3 @@ def ingest_edinet_for_company(edinet_code: str, date: str):
82
  if recs:
83
  add_to_index(recs)
84
  return len(recs)
85
-
86
- # --- TDnet(任意: 有料API or RSSフォールバック) ---
87
- def ingest_tdnet_by_rss(sec_code: str):
88
- # JPX公式TDnet APIは有料。契約がない場合は外部RSS等で代替取得に留める。:contentReference[oaicite:5]{index=5}
89
- # 実装は必要に応じて追加(MVPではEDINET中心で十分)
90
- return 0
 
1
+ import io, os, re, zipfile, requests
2
  import fitz # PyMuPDF
3
  from tqdm import tqdm
4
  from app.config import settings
 
7
  EDINET_LIST = "https://api.edinet-fsa.go.jp/api/v2/documents.json"
8
  EDINET_GET = "https://api.edinet-fsa.go.jp/api/v2/documents/{doc_id}"
9
 
 
 
 
10
  def list_edinet(date: str):
11
  params = {"date": date, "type": 2, "Subscription-Key": settings.EDINET_API_KEY}
12
  r = requests.get(EDINET_LIST, params=params, timeout=60)
 
19
  url = EDINET_GET.format(doc_id=doc_id)
20
  r = requests.get(url, params=params, timeout=120)
21
  r.raise_for_status()
 
22
  content = r.content
23
  try:
24
  with zipfile.ZipFile(io.BytesIO(content)) as zf:
 
26
  for name in zf.namelist():
27
  if name.lower().endswith(".pdf"):
28
  pdf_bytes += zf.read(name)
 
 
 
29
  return pdf_bytes
30
  except zipfile.BadZipFile:
 
31
  return content
32
 
33
  def pdf_to_text(pdf_bytes: bytes) -> str:
34
+ if not pdf_bytes:
35
+ return ""
36
+ texts = []
37
  with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
38
  for page in doc:
39
+ texts.append(page.get_text("text"))
40
+ return "\n".join(texts)
41
 
42
  def chunk_text(text: str, max_chars=1000, overlap=150):
43
  text = re.sub(r"\s+", " ", text).strip()
 
50
  return chunks
51
 
52
  def build_source_url(doc_id: str) -> str:
 
53
  return f"/proxy/edinet/{doc_id}?type=pdf"
54
 
55
  def ingest_edinet_for_company(edinet_code: str, date: str):
56
  recs = []
57
  for row in list_edinet(date):
58
+ if row.get("edinetCode") != edinet_code:
59
  continue
60
  if row.get("pdfFlag") != "1":
61
  continue
 
75
  if recs:
76
  add_to_index(recs)
77
  return len(recs)