"""
finGraph.py — AI news knowledge-graph builder
=============================================
Run order:
  1. Run finScrapping.py -> produces Articles_*.xlsx
  2. Run this file       -> loads entities / relations / vectors into Neo4j

Nodes:     AICompany, AITechnology, AIService, AIField, Article, Content, Media
Relations: DEVELOPS, INVESTS_IN, PARTNERS_WITH, APPLIES, USED_IN, RELATED_TO,
           MENTIONS, HAS_CHUNK, PUBLISHED
"""
|
|
| import glob |
| import json |
| import os |
| from typing import Dict, List, TypedDict |
|
|
| import dotenv |
| import neo4j |
| import pandas as pd |
| from langchain_openai import ChatOpenAI |
| from langgraph.graph import END, StateGraph |
| from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings |
| from neo4j_graphrag.indexes import create_vector_index |
| from neo4j_graphrag.llm import OpenAILLM |
|
|
| dotenv.load_dotenv() |
|
|
|
|
| |
# Emoji used in this module's log lines and the plain-text tag substituted for
# each one when the output stream cannot encode it.
_EMOJI_FALLBACKS = (
    ("✅", "[OK]"),
    ("⚠️", "[WARN]"),
    ("🚨", "[ERR]"),
    ("⏭️", "[SKIP]"),
    ("🤖", "[AI]"),
    ("🏢", "[COMP]"),
    ("📊", "[GRAPH]"),
    ("📰", "[NEWS]"),
    ("🔬", "[TECH]"),
    ("🔗", "[LINK]"),
)


def safe_print(*args, **kwargs) -> None:
    """Write ``args`` like ``print`` without failing on un-encodable text.

    The message is written as-is first. If the stream raises
    ``UnicodeEncodeError`` the known emoji are swapped for ASCII tags and the
    write is retried; if that still fails, every remaining un-encodable
    character is replaced with ``?``.

    Args:
        *args: Values to print; each is converted with ``str``.
        **kwargs: ``sep`` and ``end`` behave as in ``print``. ``file`` selects
            the output stream (default ``sys.stdout``). Other keys are
            ignored, and the stream is always flushed.
    """
    import sys

    sep = kwargs.get("sep")
    if sep is None:  # print() treats sep=None as the default separator
        sep = " "
    end = kwargs.get("end")
    if end is None:
        end = "\n"
    stream = kwargs.get("file") or sys.stdout

    msg = sep.join(map(str, args))
    try:
        stream.write(msg + end)
        stream.flush()
        return
    except UnicodeEncodeError:
        pass

    cleaned = msg
    for emoji, tag in _EMOJI_FALLBACKS:
        cleaned = cleaned.replace(emoji, tag)
    try:
        stream.write(cleaned + end)
    except UnicodeEncodeError:
        # Last resort: degrade to whatever the stream can represent, keeping
        # the readable tags that were just substituted.
        encoding = getattr(stream, "encoding", None) or "ascii"
        try:
            degraded = cleaned.encode(encoding, errors="replace").decode(encoding)
        except LookupError:
            degraded = cleaned.encode("ascii", errors="replace").decode("ascii")
        stream.write(degraded + end)
    stream.flush()


# Route every print() call in this module through the encoding-safe wrapper.
print = safe_print
|
|
|
|
def get_neo4j_driver() -> neo4j.Driver:
    """Create and verify a Neo4j driver from environment variables.

    ``NEO4J_CLIENT_ID`` / ``NEO4J_CLIENT_SECRET`` are tried first when both are
    set. If that connection cannot be verified, ``NEO4J_USERNAME`` /
    ``NEO4J_PASSWORD`` (defaults ``neo4j`` / ``password``) are used instead.
    The URI comes from ``NEO4J_URI`` (default ``neo4j://localhost:7687``).

    Returns:
        A driver whose connectivity has been verified.

    Raises:
        Exception: Whatever the driver raises when the username/password
            connection cannot be created or verified.
    """
    uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
    client_id = os.getenv("NEO4J_CLIENT_ID")
    client_secret = os.getenv("NEO4J_CLIENT_SECRET")

    if client_id and client_secret:
        candidate = None
        try:
            candidate = neo4j.GraphDatabase.driver(uri, auth=(client_id, client_secret))
            candidate.verify_connectivity()
            return candidate
        except Exception as err:
            # Release the half-open driver before falling back so its
            # connection pool is not leaked.
            if candidate is not None:
                candidate.close()
            print(f"⚠️ Neo4j 클라이언트 자격증명 접속 실패, 사용자 계정으로 재시도: {err}")

    username = os.getenv("NEO4J_USERNAME", "neo4j")
    password = os.getenv("NEO4J_PASSWORD", "password")
    fallback = neo4j.GraphDatabase.driver(uri, auth=(username, password))
    try:
        fallback.verify_connectivity()
    except Exception:
        fallback.close()
        raise
    return fallback
|
|
|
|
# Shared Neo4j driver; stays None until main() assigns the connected driver.
driver = None

# OpenAI-backed clients, created at import time.
# chat_llm is invoked by the pipeline nodes; embedder is used by main() to
# embed Content chunks. rag_llm is not referenced in the visible part of this
# file - presumably consumed elsewhere (TODO confirm).
chat_llm = ChatOpenAI(model="gpt-4o", temperature=0)
rag_llm = OpenAILLM(model_name="gpt-4o-mini", model_params={"temperature": 0})
embedder = OpenAIEmbeddings(model="text-embedding-3-small")

# Name passed to create_vector_index() for the Content embedding index.
INDEX_NAME = "content_vector_index"
|
|
| |
| |
| |
|
|
|
|
class ArticleState(TypedDict):
    """State dictionary passed between the nodes of the extraction pipeline."""

    # Identifier of the article being processed.
    article_id: str
    # Article title, interpolated into the entity-extraction prompt.
    title: str
    # Text sent to the LLM (main() builds it as title + newline + content).
    text: str
    # Set by check_ai_relevance; decides whether extraction runs at all.
    is_ai_related: bool
    # Validated entities: dicts with "name", "type" and "description".
    entities: List[Dict]
    # Validated relations: dicts with "source", "relation" and "target".
    relations: List[Dict]
    # Number of entity-extraction attempts made so far.
    retry_count: int
    # Validation problems from the last entity attempt ("" when none).
    reflection_feedback: str
    # Number of relation-extraction attempts made so far.
    relation_retry_count: int
    # Validation problems from the last relation attempt ("" when none).
    relation_feedback: str
|
|
|
|
def check_ai_relevance(state: ArticleState) -> ArticleState:
    """Node 1: ask the chat model whether the article is AI-related.

    Only the first 400 characters of ``state["text"]`` are sent. The article
    counts as AI-related when the reply, lower-cased and stripped, starts
    with ``"yes"``.

    Returns:
        A copy of ``state`` with ``is_ai_related`` updated.
    """
    excerpt = state["text"][:400]
    prompt = (
        "다음 기사가 AI(인공지능) 기술·기업·서비스와 관련 있으면 yes, 아니면 no로만 답하세요.\n\n"
        f"{excerpt}\n\n답변(yes/no):"
    )
    res = chat_llm.invoke(prompt)
    answer = str(res.content).strip().lower()
    return {
        **state,
        "is_ai_related": answer.startswith("yes"),
    }
|
|
|
|
# Entity labels the extractor accepts, in the order they are listed in the
# prompt and in validation feedback.
_ENTITY_TYPES = ("AICompany", "AITechnology", "AIService", "AIField")


def _entity_reply_json(reply: str):
    """Parse the JSON document in an LLM reply, unwrapping a ``` fence if present."""
    raw = reply.strip()
    if "```" in raw:
        raw = raw.split("```")[1].strip()
        # Drop the optional language tag as a prefix. str.lstrip("json") would
        # strip any leading run of the characters j/s/o/n instead.
        if raw[:4].lower() == "json":
            raw = raw[4:]
    return json.loads(raw)


def extract_entities(state: ArticleState) -> ArticleState:
    """Node 2: extract entities with the chat model and validate them.

    The prompt contains the title, the first 900 characters of the text and,
    on a retry, the validation feedback from the previous attempt. Each
    returned entity must have a non-empty name, one of the allowed types and a
    non-empty description; entities that fail are dropped and the reason is
    recorded as feedback.

    Returns:
        A copy of ``state`` with ``entities`` (valid entities only),
        ``retry_count`` (incremented) and ``reflection_feedback`` (problems
        found, ``""`` when there were none).
    """
    retry_count = state.get("retry_count", 0) + 1
    feedback = state.get("reflection_feedback", "")

    feedback_prompt = ""
    if feedback:
        feedback_prompt = (
            f"\n\n⚠️ [이전 시도에 대한 검증 오류 피드백]:\n{feedback}\n"
            "위 오류를 반드시 분석하여, 이번에는 중복되거나 비어있거나 불완전하지 않고 "
            "정확한 타입과 설명을 갖춘 올바른 엔티티만 엄격하게 JSON으로 추출해주세요."
        )

    prompt = (
        "다음 AI 뉴스에서 핵심 엔티티들을 추출하세요.\n"
        "엔티티 유형:\n"
        "- AICompany: 기업/기관 (예: 삼성전자, OpenAI)\n"
        "- AITechnology: AI 기술 (예: 대규모언어모델, 강화학습)\n"
        "- AIService: 서비스/제품 (예: ChatGPT, HyperCLOVA X)\n"
        "- AIField: 적용 분야 (예: 금융AI, AI 반도체)\n"
        "\n"
        f"제목: {state['title']}\n"
        f"본문: {state['text'][:900]}{feedback_prompt}\n"
        "\n"
        'JSON으로만 응답: {"entities":[{"name":"...",'
        '"type":"AICompany|AITechnology|AIService|AIField","description":"..."}]}'
    )

    res = chat_llm.invoke(prompt)
    entities: List[Dict] = []
    problems: List[str] = []

    try:
        data = _entity_reply_json(str(res.content))
    except ValueError as err:
        problems.append(f"응답 JSON 파싱 실패 또는 형식이 올바르지 않습니다. 에러: {str(err)}")
    else:
        extracted = data.get("entities", []) if isinstance(data, dict) else None
        if not isinstance(extracted, list):
            problems.append(
                "응답 JSON 파싱 실패 또는 형식이 올바르지 않습니다. 에러: 'entities' 배열이 없습니다."
            )
        else:
            for item in extracted:
                if not isinstance(item, dict):
                    problems.append("- 엔티티의 이름(name) 필드가 누락되었거나 비어있습니다.")
                    continue
                # Coerce defensively: a null or non-string field must reject
                # only this entity, not abort the whole batch.
                name = str(item.get("name") or "").strip()
                etype = str(item.get("type") or "").strip()
                desc = str(item.get("description") or "").strip()

                if not name:
                    problems.append("- 엔티티의 이름(name) 필드가 누락되었거나 비어있습니다.")
                    continue
                if etype not in _ENTITY_TYPES:
                    problems.append(
                        f"- 엔티티 '{name}'의 타입 '{etype}'은 허용된 종류({', '.join(_ENTITY_TYPES)})가 아닙니다."
                    )
                    continue
                if not desc:
                    problems.append(f"- 엔티티 '{name}'에 대한 설명(description)이 생략되었습니다.")
                    continue

                entities.append({"name": name, "type": etype, "description": desc})

            if not entities:
                # Keep the per-entity reasons: they are what the retry prompt
                # needs in order to correct itself.
                problems.insert(0, "유효한 엔티티가 하나도 추출되지 않았습니다.")

    return {
        **state,
        "entities": entities,
        "retry_count": retry_count,
        "reflection_feedback": "\n".join(problems).strip(),
    }
|
|
|
|
# Relation types the extractor accepts.
_RELATION_TYPES = (
    "DEVELOPS",
    "INVESTS_IN",
    "PARTNERS_WITH",
    "APPLIES",
    "USED_IN",
    "RELATED_TO",
)


def _relation_reply_json(reply: str):
    """Parse the JSON document in an LLM reply, unwrapping a ``` fence if present."""
    raw = reply.strip()
    if "```" in raw:
        raw = raw.split("```")[1].strip()
        # Drop the optional language tag as a prefix. str.lstrip("json") would
        # strip any leading run of the characters j/s/o/n instead.
        if raw[:4].lower() == "json":
            raw = raw[4:]
    return json.loads(raw)


def extract_relations(state: ArticleState) -> ArticleState:
    """Node 3: extract relations between the already-validated entities.

    Returns immediately with no relations when ``state["entities"]`` is empty.
    Otherwise the chat model is prompted with the entity list and the first
    900 characters of the text (plus the previous attempt's feedback on a
    retry). A relation is kept only when both endpoints are names from the
    entity list, the relation type is allowed, and source differs from target.

    Returns:
        A copy of ``state`` with ``relations``, ``relation_retry_count``
        (incremented) and ``relation_feedback`` (problems found, ``""`` when
        there were none).
    """
    if not state["entities"]:
        return {**state, "relations": [], "relation_retry_count": 0, "relation_feedback": ""}

    relation_retry = state.get("relation_retry_count", 0) + 1
    rel_feedback = state.get("relation_feedback", "")

    names_list = [e["name"] for e in state["entities"]]
    elist = "\n".join([f"- {e['name']} ({e['type']})" for e in state["entities"]])

    feedback_prompt = ""
    if rel_feedback:
        feedback_prompt = (
            f"\n\n⚠️ [이전 시도 관계 추출 오류 피드백]:\n{rel_feedback}\n"
            "위 오류를 반드시 수정하여, source/target 이름이 엔티티 목록에 있는 이름과 정확히 일치하는 "
            "관계만 JSON으로 응답하세요."
        )

    prompt = (
        f"다음 AI 뉴스에서 엔티티 간의 관계를 추출하세요.\n\n"
        f"엔티티 목록 (이름은 정확히 이 목록에서만 사용):\n{elist}\n\n"
        f"본문: {state['text'][:900]}\n\n"
        "관계 유형:\n"
        "- DEVELOPS: 기업이 기술/서비스를 개발\n"
        "- INVESTS_IN: 기업이 다른 기업/분야에 투자\n"
        "- PARTNERS_WITH: 기업 간 파트너십/협력\n"
        "- APPLIES: 기업이 기술을 특정 분야에 적용\n"
        "- USED_IN: 기술/서비스가 특정 분야/제품에 활용\n"
        "- RELATED_TO: 일반적 연관 관계\n\n"
        "규칙: source와 target은 반드시 위 엔티티 목록의 정확한 이름을 사용할 것. "
        "엔티티가 최소 2개 이상이면 반드시 1개 이상의 관계를 추출할 것.\n\n"
        f"{feedback_prompt}"
        'JSON으로만 응답: {"relations":[{"source":"엔티티명","relation":"관계유형","target":"엔티티명"}]}'
    )

    res = chat_llm.invoke(prompt)
    relations: List[Dict] = []
    problems: List[str] = []

    try:
        data = _relation_reply_json(str(res.content))
    except ValueError as err:
        problems.append(f"JSON 파싱 실패: {str(err)}")
    else:
        parsed = data.get("relations", []) if isinstance(data, dict) else None
        if not isinstance(parsed, list):
            problems.append("JSON 파싱 실패: 'relations' 배열이 없습니다.")
        else:
            names_set = set(names_list)
            for r in parsed:
                if not isinstance(r, dict):
                    problems.append("- source ''이 엔티티 목록에 없음")
                    continue
                # Coerce defensively: a null or non-string field must reject
                # only this relation, not abort the whole batch.
                src = str(r.get("source") or "").strip()
                tgt = str(r.get("target") or "").strip()
                rel = str(r.get("relation") or "").strip().upper()
                if src not in names_set:
                    problems.append(f"- source '{src}'이 엔티티 목록에 없음")
                    continue
                if tgt not in names_set:
                    problems.append(f"- target '{tgt}'이 엔티티 목록에 없음")
                    continue
                if rel not in _RELATION_TYPES:
                    problems.append(f"- 관계유형 '{rel}'은 허용되지 않음")
                    continue
                if src == tgt:
                    problems.append(f"- source와 target이 동일({src})하여 제외")
                    continue
                relations.append({"source": src, "relation": rel, "target": tgt})

            if len(names_list) >= 2 and not relations:
                # Lead with the summary but keep the per-relation reasons so
                # the retry prompt knows what to correct.
                problems.insert(
                    0,
                    f"엔티티가 {len(names_list)}개임에도 유효 관계가 0개입니다. "
                    "본문에서 반드시 연관되는 엔티티 쌍을 찾아 관계를 추출하세요.",
                )

    return {
        **state,
        "relations": relations,
        "relation_retry_count": relation_retry,
        "relation_feedback": "\n".join(problems).strip(),
    }
|
|
|
|
def route_after_check(state: ArticleState) -> str:
    """Pick the node that follows the AI-relevance check.

    Returns:
        ``"extract_entities"`` when ``state["is_ai_related"]`` is truthy,
        otherwise ``END`` so the pipeline stops for this article.
    """
    if state["is_ai_related"]:
        return "extract_entities"
    return END
|
|
|
|
def validate_entities(state: ArticleState) -> str:
    """Route after entity extraction, retrying it up to three attempts.

    Extraction is repeated while the last attempt produced feedback or no
    entities and fewer than three attempts have been made. After that the
    pipeline proceeds to relation extraction with whatever was collected.

    Returns:
        ``"extract_entities"`` to retry, otherwise ``"extract_relations"``.
    """
    max_attempts = 3
    retry_count = state.get("retry_count", 0)
    feedback = state.get("reflection_feedback", "")
    entities = state.get("entities", [])

    needs_retry = bool(feedback) or not entities
    if needs_retry and retry_count < max_attempts:
        print(
            f" ⚠️ [엔티티 Self-Reflection] 품질 미달 ({retry_count}/{max_attempts}). "
            f"피드백: {feedback[:80]}"
        )
        return "extract_entities"

    if feedback and retry_count >= max_attempts:
        print(
            f" 🚨 [엔티티 Self-Reflection] {max_attempts}회 초과, 강제 통과. "
            f"피드백: {feedback[:80]}"
        )

    return "extract_relations"
|
|
|
|
def validate_relations(state: ArticleState) -> str:
    """Route after relation extraction, retrying it up to two attempts.

    Extraction is repeated only when there are at least two entities, no valid
    relation was produced, and fewer than two attempts have been made. When
    some relations survived but others were rejected, a notice is printed and
    the pipeline ends.

    Returns:
        ``"extract_relations"`` to retry, otherwise ``END``.
    """
    max_attempts = 2
    rel_retry = state.get("relation_retry_count", 0)
    rel_feedback = state.get("relation_feedback", "")
    relations = state.get("relations", [])
    entities = state.get("entities", [])

    if len(entities) >= 2 and not relations and rel_retry < max_attempts:
        print(
            f" ⚠️ [관계 Self-Reflection] 관계 0개 ({rel_retry}/{max_attempts}). "
            f"재시도: {rel_feedback[:80]}"
        )
        return "extract_relations"

    if rel_feedback and relations:
        print(f" ⚠️ [관계 Self-Reflection] 일부 무효 관계 제외됨. 유효 관계: {len(relations)}개")

    return END
|
|
|
|
# Assemble the LangGraph state machine:
# check_ai -> extract_entities -> extract_relations, where both extractors can
# loop back to themselves when their validator asks for a retry.
builder = StateGraph(ArticleState)
builder.add_node("check_ai", check_ai_relevance)
builder.add_node("extract_entities", extract_entities)
builder.add_node("extract_relations", extract_relations)
builder.set_entry_point("check_ai")
# route_after_check returns either "extract_entities" or END.
builder.add_conditional_edges("check_ai", route_after_check)

# Entity self-reflection loop: retry extraction or move on to relations.
builder.add_conditional_edges(
    "extract_entities",
    validate_entities,
    {
        "extract_entities": "extract_entities",
        "extract_relations": "extract_relations",
    },
)

# Relation self-reflection loop: retry extraction or finish.
builder.add_conditional_edges(
    "extract_relations",
    validate_relations,
    {
        "extract_relations": "extract_relations",
        END: END,
    },
)

# Compiled pipeline; main() invokes it once per article.
pipeline = builder.compile()
|
|
|
|
| |
| |
| |
|
|
# Maps an extracted entity "type" to the Neo4j node label used for it.
# Lookups fall back to "AICompany" for unknown types, so only these labels are
# ever interpolated into Cypher text.
ENTITY_TYPE_MAP = {
    "AICompany": "AICompany",
    "AITechnology": "AITechnology",
    "AIService": "AIService",
    "AIField": "AIField",
}
|
|
|
|
def setup_schema(tx) -> None:
    """Create the uniqueness constraints used by the graph, if missing.

    Each statement uses ``IF NOT EXISTS``, so re-running is harmless. A
    failing statement is reported and skipped so the remaining constraints
    are still attempted.

    Args:
        tx: Transaction-like object exposing ``run(query)``.
    """
    name_keyed_labels = ("AICompany", "AITechnology", "AIService", "AIField")
    constraints = [
        f"CREATE CONSTRAINT IF NOT EXISTS FOR (n:{label}) REQUIRE n.name IS UNIQUE"
        for label in name_keyed_labels
    ]
    constraints += [
        "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Article) REQUIRE n.article_id IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Content) REQUIRE n.content_id IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Media) REQUIRE n.name IS UNIQUE",
    ]
    for statement in constraints:
        try:
            tx.run(statement)
        except Exception as err:
            # Still best-effort, but no longer silent: a missing constraint
            # lets duplicate nodes accumulate unnoticed.
            print(f"⚠️ 스키마 제약조건 생성 실패: {statement} ({err})")
|
|
|
|
def upsert_entity(tx, e: Dict) -> None:
    """Merge one entity node, keeping any description it already has.

    The node label is looked up in ``ENTITY_TYPE_MAP`` (falling back to
    ``AICompany``). The description is set on creation; on a match it is only
    filled in when the stored one is null.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        e: Entity dict with ``"name"`` and optional ``"type"`` /
            ``"description"``.
    """
    requested_type = e.get("type", "AICompany")
    label = ENTITY_TYPE_MAP.get(requested_type, "AICompany")
    cypher = (
        f"MERGE (n:{label} {{name:$name}}) "
        "ON CREATE SET n.description=$desc "
        "ON MATCH SET n.description=COALESCE(n.description,$desc)"
    )
    params = {"name": e["name"], "desc": e.get("description", "")}
    tx.run(cypher, **params)
|
|
|
|
def upsert_relation(tx, r: Dict) -> None:
    """Merge one relationship between two existing entity nodes.

    Relation types outside the allowed set are ignored. Endpoints are matched
    by name but restricted to the labels in ``ENTITY_TYPE_MAP``, so a
    ``Media`` (or any other) node that shares a name with an entity is never
    linked by mistake.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        r: Relation dict with ``"source"``, ``"target"`` and optional
            ``"relation"`` (defaults to ``RELATED_TO``).
    """
    rel = str(r.get("relation") or "RELATED_TO").upper().replace(" ", "_")
    allowed = {
        "DEVELOPS",
        "INVESTS_IN",
        "PARTNERS_WITH",
        "APPLIES",
        "USED_IN",
        "RELATED_TO",
    }
    if rel not in allowed:
        return

    # rel and the labels come from fixed whitelists, so interpolating them
    # into the Cypher text is safe; names are passed as parameters.
    labels = sorted(set(ENTITY_TYPE_MAP.values()))
    src_is_entity = " OR ".join(f"s:{label}" for label in labels)
    tgt_is_entity = " OR ".join(f"t:{label}" for label in labels)
    query = (
        f"MATCH (s {{name:$src}}) WHERE {src_is_entity} "
        f"MATCH (t {{name:$tgt}}) WHERE {tgt_is_entity} "
        f"MERGE (s)-[:{rel}]->(t)"
    )
    try:
        tx.run(query, src=r["source"], tgt=r["target"])
    except Exception as err:
        # Best-effort: one bad relation must not stop the article, but the
        # failure is reported instead of being hidden.
        print(f" ⚠️ 관계 적재 실패 ({r.get('source')} -[{rel}]-> {r.get('target')}): {err}")
|
|
|
|
def upsert_article_and_mentions(tx, row: pd.Series, entities: List[Dict]) -> None:
    """Merge the Article node, its Media publisher and its MENTIONS edges.

    The article id is stored as ``str(row["article_id"])``, falling back to
    ``ART_<row index label>`` when the column is absent. This is the same key
    ``main()`` uses when it checks for already-loaded articles and links
    chunks. A Media node is merged only when ``source`` is present and
    non-blank. MENTIONS edges are created only to entity nodes that already
    exist.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        row: One article row; reads ``article_id``, ``title``, ``url``,
            ``published_date`` and ``source``.
        entities: Entity dicts with ``"name"`` and optional ``"type"``.
    """

    def cell_text(column: str) -> str:
        # Missing or NaN cells become "" rather than a float NaN property.
        value = row.get(column, "")
        return "" if value is None or pd.isna(value) else str(value)

    aid = str(row.get("article_id", f"ART_{row.name}"))

    tx.run(
        "MERGE (a:Article {article_id:$aid}) SET a.title=$title, a.url=$url, a.published_date=$date",
        aid=aid,
        title=cell_text("title"),
        url=cell_text("url"),
        date=str(row.get("published_date", "")),
    )

    source = cell_text("source").strip()
    if source:
        tx.run(
            "MERGE (m:Media {name:$src}) WITH m MATCH (a:Article {article_id:$aid}) MERGE (m)-[:PUBLISHED]->(a)",
            src=source,
            aid=aid,
        )

    for e in entities:
        ntype = ENTITY_TYPE_MAP.get(e.get("type", "AICompany"), "AICompany")
        try:
            tx.run(
                f"MATCH (a:Article {{article_id:$aid}}) MATCH (n:{ntype} {{name:$name}}) MERGE (a)-[:MENTIONS]->(n)",
                aid=aid,
                name=e["name"],
            )
        except Exception as err:
            # Best-effort per entity, but report the failure.
            print(f" ⚠️ MENTIONS 관계 적재 실패 ({aid} -> {e.get('name')}): {err}")
|
|
|
|
def chunk_text(text: str, size: int = 500, overlap: int = 50) -> List[str]:
    """Split ``text`` into overlapping, whitespace-trimmed chunks.

    Consecutive chunks start ``size - overlap`` characters apart. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing chunk
    is emitted that lies entirely inside the previous one. Chunks that are
    blank after trimming are skipped.

    Args:
        text: Text to split; ``None``, NaN and empty values give ``[]``.
        size: Maximum chunk length in characters.
        overlap: Characters shared by neighbouring chunks; must be smaller
            than ``size``.

    Returns:
        The list of non-empty chunks in document order.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is not in
            ``[0, size)``.
    """
    if not isinstance(text, str) and pd.isna(text):
        return []
    if not text:
        return []
    if size <= 0 or not 0 <= overlap < size:
        # A zero step makes range() raise; a negative one silently yields
        # nothing. Reject both explicitly.
        raise ValueError("chunk_text requires size > 0 and 0 <= overlap < size")

    text = str(text)
    step = size - overlap
    chunks: List[str] = []
    for start in range(0, len(text), step):
        piece = text[start : start + size].strip()
        if piece:
            chunks.append(piece)
        if start + size >= len(text):
            break
    return chunks
|
|
|
|
| |
| |
| |
|
|
|
|
def is_article_loaded(tx, aid: str) -> bool:
    """Report whether an Article node with this id already exists.

    main() uses this to skip the LLM pipeline for articles that were loaded
    on a previous run.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        aid: Article id to look up.

    Returns:
        ``True`` when at least one matching Article node exists.
    """
    record = tx.run(
        "MATCH (a:Article {article_id:$aid}) RETURN count(a) as cnt",
        aid=aid,
    ).single()
    if not record:
        return False
    return record["cnt"] > 0
|
|
|
|
| |
| |
| |
|
|
|
|
def _cell_text(row: pd.Series, column: str) -> str:
    """Return ``row[column]`` as text, mapping a missing or NaN cell to ``""``."""
    value = row.get(column, "")
    if value is None or pd.isna(value):
        return ""
    return str(value)


def _load_articles() -> pd.DataFrame:
    """Read every Articles_*.xlsx file and return the rows de-duplicated by URL."""
    xlsx_files = sorted(
        glob.glob("Articles_*.xlsx")
        + glob.glob(os.path.join("src", "graphBuilder", "scrapping", "Articles_*.xlsx"))
    )
    if not xlsx_files:
        raise FileNotFoundError("Articles_*.xlsx 파일이 없습니다. finScrapping.py를 먼저 실행하세요.")

    frames = []
    for path in xlsx_files:
        try:
            frames.append(pd.read_excel(path))
        except Exception as err:
            print(f"⚠️ {path} 로드 실패: {err}")
    if not frames:
        # pd.concat([]) would raise an opaque ValueError; keep the type but
        # say what actually went wrong.
        raise ValueError("읽을 수 있는 Articles_*.xlsx 파일이 없습니다.")

    df = pd.concat(frames, ignore_index=True).drop_duplicates(subset=["url"])
    print(f"✅ 로드 완료: 총 {len(frames)}개 엑셀 파일 통합 완료 ({len(df)}건의 고유 기사 대상)")
    return df


def _ingest_articles(db, df: pd.DataFrame) -> None:
    """Run the LLM pipeline on each not-yet-loaded article and store the result."""
    total = len(df)
    print(f"총 {total}건 중 신규 기사 필터링 및 처리 시작...")
    # pos is the 1-based progress counter. idx is the index label, which has
    # gaps after drop_duplicates and is only used for the fallback id.
    for pos, (idx, row) in enumerate(df.iterrows(), start=1):
        aid = str(row.get("article_id", f"ART_{idx}"))
        title = _cell_text(row, "title")

        with db.session() as s:
            exists = s.execute_read(is_article_loaded, aid)
        if exists:
            print(f" ⏭️ [{pos}/{total}] 이미 적재됨 (스킵): {title[:35]}...")
            continue

        text = title + "\n" + _cell_text(row, "content")
        state: ArticleState = dict(
            article_id=aid,
            title=title,
            text=text,
            is_ai_related=False,
            entities=[],
            relations=[],
            retry_count=0,
            reflection_feedback="",
            relation_retry_count=0,
            relation_feedback="",
        )
        out = pipeline.invoke(state)
        if not out["is_ai_related"]:
            print(f" ⏭️ [{pos}/{total}] AI 비관련 (적재 제외): {title[:35]}...")
            continue

        # Store the article under exactly the id used for the lookups above,
        # so the "already loaded" check and chunk linking can find it later.
        keyed_row = row.copy()
        keyed_row["article_id"] = aid
        with db.session() as s:
            for entity in out["entities"]:
                s.execute_write(upsert_entity, entity)
            for relation in out["relations"]:
                s.execute_write(upsert_relation, relation)
            s.execute_write(upsert_article_and_mentions, keyed_row, out["entities"])

        rel_cnt = len(out["relations"])
        ent_cnt = len(out["entities"])
        rel_warn = " ⚠️ 관계=0" if ent_cnt >= 2 and rel_cnt == 0 else ""
        print(
            f" ✅ [{pos}/{total}] 신규 적재완료: {title[:35]}... "
            f"| 엔티티: {ent_cnt}개 | 관계: {rel_cnt}개{rel_warn}"
        )


def _embed_article_chunks(db, df: pd.DataFrame) -> None:
    """Create embedded Content chunks for stored articles that have none yet."""
    print("Content 노드 생성 및 신규 임베딩 시작...")
    for idx, row in df.iterrows():
        aid = str(row.get("article_id", f"ART_{idx}"))

        with db.session() as s:
            record = s.run(
                "MATCH (a:Article {article_id:$aid}) "
                "OPTIONAL MATCH (a)-[:HAS_CHUNK]->(c:Content) "
                "RETURN count(DISTINCT a) AS articles, count(c) AS cnt",
                aid=aid,
            ).single()
        # Skip rows with no Article node (filtered out as non-AI). Embedding
        # them would be repeated on every run and leave orphan Content nodes.
        # Also skip articles that already have chunks.
        if record is None or record["articles"] == 0 or record["cnt"] > 0:
            continue

        chunks = chunk_text(_cell_text(row, "content"))
        if not chunks:
            continue
        with db.session() as s:
            for i, chunk in enumerate(chunks):
                cid = f"{aid}_chunk_{i}"
                vec = embedder.embed_query(chunk)
                s.run(
                    "MERGE (c:Content {content_id:$cid}) "
                    "SET c.chunk=$chunk, c.article_id=$aid, c.chunk_index=$i, c.embedding=$vec "
                    "WITH c MATCH (a:Article {article_id:$aid}) MERGE (a)-[:HAS_CHUNK]->(c)",
                    cid=cid,
                    chunk=chunk,
                    aid=aid,
                    i=i,
                    vec=vec,
                )


def main() -> None:
    """Build or extend the knowledge graph from the scraped article files.

    Steps: connect to Neo4j, load and de-duplicate the Excel files, ensure the
    schema constraints, run the extraction pipeline on new articles, embed
    their content chunks, and create the vector index. The driver is closed
    when the run ends, whether it succeeded or failed.

    Raises:
        FileNotFoundError: If no ``Articles_*.xlsx`` file is found.
        ValueError: If none of the files found could be read.
    """
    global driver
    driver = get_neo4j_driver()
    try:
        df = _load_articles()

        with driver.session() as s:
            s.execute_write(setup_schema)
        print("✅ Neo4j 스키마 준비 완료 (기존 데이터 보존)")

        _ingest_articles(driver, df)
        print("\n✅ 엔티티/관계 추출 및 Neo4j 증분 적재 완료")

        _embed_article_chunks(driver, df)
        print("✅ Content 노드 신규 임베딩 적재 완료")

        create_vector_index(
            driver,
            INDEX_NAME,
            label="Content",
            embedding_property="embedding",
            dimensions=1536,
            similarity_fn="cosine",
        )
        print(f"✅ 벡터 인덱스 [{INDEX_NAME}] 갱신 및 검증 완료")
    finally:
        # Release the connection pool and do not leave a closed driver behind
        # in the module global.
        driver.close()
        driver = None


if __name__ == "__main__":
    main()
|
|