GraphRAG / ingest.py
Sanjam19's picture
clean initial commit
a4ab72e
Raw
History Blame Contribute Delete
5.66 kB
import json
import spacy
from datasets import load_dataset
from pathlib import Path
# Directory that receives the generated JSON files; created if it is missing.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Both corpora are read from their "train" split.
print("Loading FinanceBench...")
fb = load_dataset("PatronusAI/financebench", split="train")

print("Loading Finance-Alpaca...")
fa = load_dataset("gbharti/finance-alpaca", split="train")

# The `en_core_web_sm` spaCy pipeline supplies the named entities for
# FinanceBench evidence passages.
print("Loading spaCy...")
nlp = spacy.load("en_core_web_sm")

# Three parallel record lists: every chunk gets one entry in each, and the
# entries are tied together by their "chunk_id".
chunks, entities, metadata = [], [], []
# ─────────────────────────────────────────────
# FinanceBench
# ─────────────────────────────────────────────
# Entity labels kept from spaCy's output; entities with any other label are
# dropped.
_KEPT_ENTITY_LABELS = frozenset(
    {"ORG", "GPE", "MONEY", "DATE", "PRODUCT", "PERSON"}
)

# Every non-empty evidence passage of every FinanceBench row becomes one
# chunk, with a matching entities record and metadata record appended in
# lockstep.
for i, row in enumerate(fb):
    company = row["company"]

    # Fields copied verbatim into both the chunk record and its metadata
    # record. They are split in two so that "text" can be slotted between
    # them, which keeps the key order of the emitted JSON unchanged.
    doc_fields = {
        "company": company,
        "doc_name": row["doc_name"],
        "period": row["doc_period"],
    }
    qa_fields = {
        "question": row["question"],
        "answer": row["answer"],
        "source": "financebench",
    }

    for j, ev in enumerate(row["evidence"]):
        # `or ""` also covers an "evidence_text" key that is present but
        # null, which would otherwise crash on `.strip()`.
        text = (ev.get("evidence_text") or "").strip()
        if not text:
            continue

        # i / j are the row and evidence positions, so skipped (empty)
        # passages leave gaps in the numbering rather than shifting ids.
        chunk_id = f"fb_{i:04d}_{j:02d}"
        id_fields = {"chunk_id": chunk_id, **doc_fields}

        chunks.append({**id_fields, "text": text, **qa_fields})

        # dict.fromkeys de-duplicates (text, label) pairs while keeping
        # first-occurrence order, so the output is identical from run to
        # run. A set would iterate in an order that depends on string hash
        # randomisation.
        parsed = nlp(text)
        unique_ents = dict.fromkeys(
            (ent.text.strip(), ent.label_)
            for ent in parsed.ents
            if ent.label_ in _KEPT_ENTITY_LABELS
        )
        entities.append({
            "chunk_id": chunk_id,
            "company": company,
            "entities": [
                {"text": ent_text, "label": ent_label}
                for ent_text, ent_label in unique_ents
            ],
        })

        metadata.append({**id_fields, **qa_fields})

print(f"FinanceBench: {len(chunks)} chunks so far")
# ─────────────────────────────────────────────
# Finance-Alpaca
# ─────────────────────────────────────────────
def chunk_text(text: str, size: int = 512, overlap: int = 50) -> list:
    """Split *text* into overlapping windows of whitespace-separated words.

    Each window holds at most ``size`` words and starts ``size - overlap``
    words after the previous one, so consecutive windows share ``overlap``
    words. The last window ends exactly at the final word; no extra,
    fully-overlapping tail window is produced.

    Args:
        text: Text to split. Words are whatever ``str.split()`` yields.
        size: Maximum number of words per window; must be at least 1.
        overlap: Number of words shared by consecutive windows; must be
            smaller than ``size``.

    Returns:
        A list of windows, each re-joined with single spaces. Empty or
        whitespace-only input gives an empty list.

    Raises:
        ValueError: If ``size`` is below 1 or ``overlap`` is not smaller
            than ``size``. Without this check the window would never
            advance and the loop would not terminate.
    """
    if size < 1:
        raise ValueError(f"size must be at least 1, got {size}")
    if overlap >= size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    words = text.split()
    total = len(words)
    step = size - overlap

    results = []
    for start in range(0, total, step):
        end = min(start + size, total)
        results.append(" ".join(words[start:end]))
        # Once a window reaches the last word, any further window would
        # only repeat words already emitted.
        if end == total:
            break
    return results
# Word budget for the whole corpus; Finance-Alpaca rows are added until the
# running count reaches it.
target_tokens = 2_100_000

# "Tokens" here are whitespace-separated words, counted over the text of
# every chunk collected so far.
current_tokens = sum(len(c["text"].split()) for c in chunks)

# The target is formatted from target_tokens itself so the message cannot
# drift from the constant.
print(f"Tokens so far: {current_tokens:,} / need {target_tokens:,}")
print(f"Starting alpaca loop, rows: {len(fa)}")
# Finance-Alpaca rows are chunked and appended until the word budget is met.
for i, row in enumerate(fa):
    # Checked once per row, so the final count can overshoot the target by
    # up to one row's worth of chunks.
    if current_tokens >= target_tokens:
        print(f"Target reached at row {i}")
        break

    question = row.get("instruction", "")
    answer = row.get("output", "")

    # The chunk body comes from the "output" field; `or ""` tolerates a
    # value that is present but null instead of crashing on `.strip()`.
    text = (answer or "").strip()

    # Outputs shorter than 20 words are not ingested. This is a guarded
    # block rather than a `continue` so the progress line below is still
    # printed when a milestone row happens to be filtered out.
    if len(text.split()) >= 20:
        # Fields shared by the chunk record and its metadata record, split
        # in two so "text" keeps its position in the emitted JSON.
        doc_fields = {
            "company": "general",
            "doc_name": f"alpaca_{i}",
            "period": "general",
        }
        qa_fields = {
            "question": question,
            "answer": answer,
            "source": "alpaca",
        }

        for j, chunk_text_str in enumerate(chunk_text(text)):
            chunk_id = f"fa_{i:05d}_{j:02d}"
            id_fields = {"chunk_id": chunk_id, **doc_fields}

            chunks.append(
                {**id_fields, "text": chunk_text_str, **qa_fields}
            )
            # No entity extraction is run for these rows; the record only
            # keeps the three lists aligned.
            entities.append({
                "chunk_id": chunk_id,
                "company": "general",
                "entities": [],
            })
            metadata.append({**id_fields, **qa_fields})

            current_tokens += len(chunk_text_str.split())

    if i % 1000 == 0:
        print(f"alpaca row {i}, tokens: {current_tokens:,}")

print(f"Final: {len(chunks)} chunks, {current_tokens:,} tokens")
# Each record list is written to its own file under DATA_DIR as indented
# UTF-8 JSON, with non-ASCII characters kept as-is rather than escaped.
for filename, records in (
    ("chunks.json", chunks),
    ("entities.json", entities),
    ("metadata.json", metadata),
):
    with open(DATA_DIR / filename, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

print("Done.")