import subprocess, sys def _install(pkg): subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", pkg]) try: from neo4j import GraphDatabase except ImportError: _install("neo4j==6.1.0") from neo4j import GraphDatabase try: from groq import Groq except ImportError: _install("groq==1.1.1") from groq import Groq try: from pyvis.network import Network except ImportError: _install("pyvis==0.3.2") from pyvis.network import Network try: import plotly.graph_objects as go except ImportError: _install("plotly==5.24.1") import plotly.graph_objects as go try: import pandas as pd except ImportError: _install("pandas==2.2.3") import pandas as pd import streamlit as st import os import time import json import re import tempfile # ───────────────────────────────────────────── # PAGE CONFIG # ───────────────────────────────────────────── st.set_page_config( page_title="GraphRAG vs Vector RAG | Daniel Fonseca", page_icon="🕸️", layout="wide", initial_sidebar_state="expanded", ) # ───────────────────────────────────────────── # CUSTOM CSS # ───────────────────────────────────────────── st.markdown(""" """, unsafe_allow_html=True) # ───────────────────────────────────────────── # NEO4J CONNECTION # ───────────────────────────────────────────── def _get_neo4j_creds(): uri = (st.secrets.get("NEO4J_URI") or os.getenv("NEO4J_URI") or "neo4j+s://e3b8c8ec.databases.neo4j.io") user = (st.secrets.get("NEO4J_USERNAME") or os.getenv("NEO4J_USERNAME") or "e3b8c8ec") password = (st.secrets.get("NEO4J_PASSWORD") or os.getenv("NEO4J_PASSWORD") or "") database = (st.secrets.get("NEO4J_DATABASE") or os.getenv("NEO4J_DATABASE") or "e3b8c8ec") return ( uri.strip().strip('"').strip("'"), user.strip().strip('"').strip("'"), password.strip().strip('"').strip("'"), database.strip().strip('"').strip("'"), ) @st.cache_resource def get_neo4j_driver(): uri, user, password, database = _get_neo4j_creds() if not password: st.sidebar.warning("NEO4J_PASSWORD not set in Secrets.") return None, None try: driver = GraphDatabase.driver( uri, auth=(user, password), max_connection_lifetime=200, keep_alive=True, ) driver.verify_connectivity() return driver, database except Exception as e: st.sidebar.error(f"Neo4j error: {e}") return None, None def get_session(driver, database): """Always get a fresh session, reconnecting if needed.""" try: return driver.session(database=database) except Exception: # Driver is defunct — clear cache and rebuild get_neo4j_driver.clear() uri, user, password, database = _get_neo4j_creds() new_driver = GraphDatabase.driver( uri, auth=(user, password), max_connection_lifetime=200, keep_alive=True, ) return new_driver.session(database=database) @st.cache_resource def get_groq_client(): api_key = st.secrets.get("GROQ_API_KEY", os.getenv("GROQ_API_KEY", "")) if not api_key: return None return Groq(api_key=api_key) # ───────────────────────────────────────────── # SEED DATABASE WITH FRAUD GRAPH # ───────────────────────────────────────────── SEED_CYPHER = """ // Clear existing MATCH (n) DETACH DELETE n; // Customers MERGE (c1:Customer {id:'C-001', name:'Carlos Mendez', risk:'high'}) MERGE (c2:Customer {id:'C-002', name:'Ana Paula', risk:'medium'}) MERGE (c3:Customer {id:'C-003', name:'Roberto Silva', risk:'high'}) MERGE (c4:Customer {id:'C-004', name:'Maria Costa', risk:'low'}) MERGE (c5:Customer {id:'C-005', name:'João Lima', risk:'medium'}) MERGE (c6:Customer {id:'C-006', name:'Lucas Ferreira', risk:'high'}) // Accounts MERGE (a1:Account {id:'A-101', balance:250.0, status:'flagged'}) MERGE (a2:Account {id:'A-102', balance:1200.0, status:'active'}) MERGE (a3:Account {id:'A-890', balance:180.0, status:'flagged'}) MERGE (a4:Account {id:'A-445', balance:3100.0, status:'mule'}) MERGE (a5:Account {id:'A-667', balance:2950.0, status:'mule'}) MERGE (a6:Account {id:'A-201', balance:500.0, status:'active'}) // Devices MERGE (d1:Device {id:'D-441', type:'mobile', os:'Android', fingerprint:'abc123'}) MERGE (d2:Device {id:'D-882', type:'emulator', os:'Android', fingerprint:'xxx999'}) MERGE (d3:Device {id:'D-103', type:'desktop', os:'Windows', fingerprint:'win456'}) // IPs MERGE (ip1:IP {address:'177.82.11.3', country:'BR', vpn:false}) MERGE (ip2:IP {address:'192.168.1.44', country:'BR', vpn:true}) MERGE (ip3:IP {address:'201.55.3.12', country:'BR', vpn:false}) // Merchants MERGE (m1:Merchant {id:'MKT-031', name:'QuickShop', category:'retail', micro_tx:47}) MERGE (m2:Merchant {id:'MKT-088', name:'FastPay', category:'digital', micro_tx:33}) MERGE (m3:Merchant {id:'MKT-201', name:'EasyStore', category:'retail', micro_tx:28}) // Customer → Account MERGE (c1)-[:HAS_ACCOUNT]->(a1) MERGE (c2)-[:HAS_ACCOUNT]->(a2) MERGE (c3)-[:HAS_ACCOUNT]->(a3) MERGE (c4)-[:HAS_ACCOUNT]->(a6) MERGE (c5)-[:HAS_ACCOUNT]->(a4) MERGE (c6)-[:HAS_ACCOUNT]->(a5) // Customer → Device MERGE (c1)-[:USED {last_seen:'2024-01-15'}]->(d1) MERGE (c2)-[:USED {last_seen:'2024-01-14'}]->(d1) MERGE (c3)-[:USED {last_seen:'2024-01-15'}]->(d1) MERGE (c4)-[:USED {last_seen:'2024-01-10'}]->(d2) MERGE (c5)-[:USED {last_seen:'2024-01-15'}]->(d2) // Account → IP MERGE (a1)-[:ACCESSED_FROM {count:12, last_seen:'2024-01-15'}]->(ip1) MERGE (a2)-[:ACCESSED_FROM {count:3, last_seen:'2024-01-14'}]->(ip1) MERGE (a3)-[:ACCESSED_FROM {count:8, last_seen:'2024-01-15'}]->(ip1) MERGE (a4)-[:ACCESSED_FROM {count:5, last_seen:'2024-01-13'}]->(ip2) MERGE (a5)-[:ACCESSED_FROM {count:7, last_seen:'2024-01-15'}]->(ip2) MERGE (a6)-[:ACCESSED_FROM {count:2, last_seen:'2024-01-10'}]->(ip3) // Money mule transfers MERGE (a2)-[:TRANSFER {amount:3200.0, date:'2024-01-15', hour:'10:00'}]->(a4) MERGE (a4)-[:TRANSFER {amount:3100.0, date:'2024-01-15', hour:'11:30'}]->(a5) MERGE (a5)-[:TRANSFER {amount:2950.0, date:'2024-01-15', hour:'14:00'}]->(a3) // Micro-transactions (card testing) MERGE (a1)-[:TRANSACTION {amount:2.99, type:'card_test'}]->(m1) MERGE (a3)-[:TRANSACTION {amount:1.50, type:'card_test'}]->(m1) MERGE (a1)-[:TRANSACTION {amount:3.00, type:'card_test'}]->(m2) MERGE (a4)-[:TRANSACTION {amount:4.99, type:'card_test'}]->(m2) MERGE (a5)-[:TRANSACTION {amount:2.00, type:'card_test'}]->(m3) """ def seed_database(driver): with get_session(driver, "e3b8c8ec") as session: for stmt in SEED_CYPHER.strip().split(';'): stmt = stmt.strip() if stmt: session.run(stmt) # ───────────────────────────────────────────── # GROQ: GENERATE CYPHER FROM NATURAL LANGUAGE # ───────────────────────────────────────────── SCHEMA = """ Graph schema: Nodes: Customer {id, name, risk}, Account {id, balance, status}, Device {id, type, os}, IP {address, country, vpn}, Merchant {id, name, category, micro_tx} Relationships: (Customer)-[:HAS_ACCOUNT]->(Account), (Customer)-[:USED]->(Device), (Account)-[:ACCESSED_FROM]->(IP), (Account)-[:TRANSFER {amount, date}]->(Account), (Account)-[:TRANSACTION {amount, type}]->(Merchant) """ def generate_cypher(groq_client, question: str) -> dict: prompt = f"""You are a Neo4j Cypher expert for fraud detection. {SCHEMA} Generate a Cypher query to answer: "{question}" Respond ONLY with a valid JSON object, no markdown, no explanation: {{"cypher": "MATCH ... RETURN ...", "explanation": "brief explanation in English", "fraud_pattern": "pattern name"}}""" response = groq_client.chat.completions.create( model="llama-3.1-8b-instant", messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=400, ) raw = response.choices[0].message.content.strip() raw = re.sub(r"```json|```", "", raw).strip() try: return json.loads(raw) except Exception: # Fallback: extract cypher with regex m = re.search(r'"cypher"\s*:\s*"([^"]+)"', raw) cypher = m.group(1) if m else "MATCH (n) RETURN n LIMIT 5" return {"cypher": cypher, "explanation": raw[:200], "fraud_pattern": "unknown"} # ───────────────────────────────────────────── # EXECUTE CYPHER + FORMAT RESULT # ───────────────────────────────────────────── def run_cypher(driver, cypher: str): try: _, database = get_neo4j_driver() with get_session(driver, database or "e3b8c8ec") as session: result = session.run(cypher) records = [dict(r) for r in result] return records, None except Exception as e: return [], str(e) def groq_summarize(groq_client, question, records, fraud_pattern): data_str = json.dumps(records[:10], default=str) prompt = f"""You are a fraud analyst AI. Given this question and Neo4j query result, write a concise 2-3 sentence analysis highlighting the fraud risk. Question: {question} Fraud pattern: {fraud_pattern} Data: {data_str} Be direct, mention specific IDs/amounts if available. Flag risk level.""" response = groq_client.chat.completions.create( model="llama-3.1-8b-instant", messages=[{"role": "user", "content": prompt}], temperature=0.3, max_tokens=200, ) return response.choices[0].message.content.strip() # ───────────────────────────────────────────── # VECTOR RAG SIMULATION (realistic mock) # ───────────────────────────────────────────── VECTOR_RESPONSES = { "device": { "steps": [ "Embedding query → 1536-dim vector", "Cosine similarity in ChromaDB", "Top-3 docs retrieved (sim: 0.72, 0.68, 0.61)", "LLM generates answer from chunks", ], "answer": "Based on retrieved transaction documents, device D-441 appears in some records from mid-January. Several customer profiles show mobile device usage. Exact relationship mapping between customers and this specific device is not available in the document corpus.", "metrics": [("Precision", "38%", False), ("Latency", "340ms", False), ("Graph context", "None", False), ("Hallucination risk", "High", False)], }, "ip": { "steps": [ "Embedding query → 1536-dim vector", "Cosine similarity in ChromaDB", "Top-3 docs retrieved (sim: 0.69, 0.65, 0.58)", "LLM generates answer from chunks", ], "answer": "Transaction records mention IP addresses in the 177.x.x.x range appearing multiple times. Some accounts may share network access points based on similar location data in the documents. Date filtering was not possible with the available embeddings.", "metrics": [("Precision", "25%", False), ("Date filter", "Impossible", False), ("Relationship depth", "0 hops", False), ("Context", "Partial", False)], }, "transfer": { "steps": [ "Embedding query → 1536-dim vector", "Cosine similarity in ChromaDB", "Top-3 docs retrieved (sim: 0.74, 0.70, 0.63)", "LLM generates answer from chunks", ], "answer": "Account documents show transfer activity between A-102 and A-890. Intermediate accounts A-445 and A-667 appear in separate transaction records. Whether these constitute a connected layering chain cannot be determined from document embeddings alone.", "metrics": [("Path finding", "Not possible", False), ("Intermediate nodes", "Missed", False), ("AML detection", "Partial", False), ("Confidence", "Low", False)], }, "merchant": { "steps": [ "Embedding query → 1536-dim vector", "Cosine similarity in ChromaDB", "Top-3 docs retrieved (sim: 0.81, 0.75, 0.70)", "LLM generates answer from chunks", ], "answer": "Merchants MKT-031 and MKT-088 appear in fraud reports mentioning small-value transactions. Card testing patterns are referenced in 2 of the 3 retrieved documents, though specific transaction counts and real-time velocity cannot be verified.", "metrics": [("Recall", "52%", False), ("Real-time", "No", False), ("Velocity check", "Impossible", False), ("Actionable", "Partial", False)], }, } def vector_rag_response(question: str): q = question.lower() if "device" in q or "d-441" in q: return VECTOR_RESPONSES["device"] elif "ip" in q or "address" in q: return VECTOR_RESPONSES["ip"] elif "transfer" in q or "mule" in q or "path" in q: return VECTOR_RESPONSES["transfer"] elif "merchant" in q or "card test" in q: return VECTOR_RESPONSES["merchant"] else: return VECTOR_RESPONSES["device"] # ───────────────────────────────────────────── # GRAPH VISUALIZER # ───────────────────────────────────────────── def build_graph_html(driver, limit=60): net = Network(height="380px", width="100%", bgcolor="#0A0F1E", font_color="#C8D8E8", directed=True) net.set_options("""{"physics":{"stabilization":{"iterations":80},"barnesHut":{"gravitationalConstant":-3000}}}""") COLOR_MAP = { "Customer": "#42A5F5", "Account": "#69F0AE", "Device": "#FFD54F", "IP": "#FF6B6B", "Merchant": "#CE93D8", } _, database = get_neo4j_driver() with get_session(driver, database or "e3b8c8ec") as session: nodes_q = "MATCH (n) RETURN n, labels(n) AS lbl LIMIT $limit" edges_q = "MATCH (a)-[r]->(b) RETURN id(a) AS src, id(b) AS tgt, type(r) AS rel, r LIMIT $limit" seen = set() for rec in session.run(nodes_q, limit=limit): n = rec["n"] lbl = rec["lbl"][0] if rec["lbl"] else "Node" nid = str(n.id) if nid in seen: continue seen.add(nid) props = dict(n) label = props.get("name", props.get("id", props.get("address", nid))) title = "\n".join(f"{k}: {v}" for k, v in props.items()) color = COLOR_MAP.get(lbl, "#888") size = 18 if lbl in ("Customer", "Account") else 13 net.add_node(nid, label=f"{lbl}\n{label}", color=color, size=size, title=title) for rec in session.run(edges_q, limit=limit): src, tgt = str(rec["src"]), str(rec["tgt"]) rel = rec["rel"] r_props = dict(rec["r"]) title = "\n".join(f"{k}: {v}" for k, v in r_props.items()) color = "#1565C0" if rel == "TRANSFER" else "#444" net.add_edge(src, tgt, label=rel, color=color, title=title) with tempfile.NamedTemporaryFile(suffix=".html", delete=False, mode="w") as f: net.save_graph(f.name) return open(f.name).read() # ───────────────────────────────────────────── # SIDEBAR # ───────────────────────────────────────────── with st.sidebar: st.markdown("""