Spaces:
Running
Running
fix(B-08,NEW-1): remove duplicate fulltext from SETUP_QUERIES; modern CREATE FULLTEXT INDEX syntax
7b8375b | """ | |
| BharatGraph - Neo4j Graph Schema | |
| Defines all node types and relationship types for the knowledge graph. | |
| Graph Model: | |
| (Politician)-[:MEMBER_OF]->(Party) | |
| (Politician)-[:DIRECTOR_OF]->(Company) | |
| (Politician)-[:CONTESTED_IN]->(Election) | |
| (Company)-[:WON_CONTRACT]->(Contract) | |
| (Contract)-[:AWARDED_BY]->(Ministry) | |
| (AuditReport)-[:FLAGS]->(Scheme) | |
| (AuditReport)-[:MENTIONS]->(Ministry) | |
| (PressRelease)-[:ISSUED_BY]->(Ministry) | |
| This is the schema that makes BharatGraph powerful: | |
| - Query: Find all companies where politician X is a director | |
| - Query: Find contracts won by companies linked to politicians | |
| - Query: Show audit reports flagging the same ministry | |
| """ | |
| # -- Node Labels ------------------------------------------- | |
| # Each dict defines the properties a node of that type can have. | |
| NODE_SCHEMAS = { | |
| "Politician": { | |
| "description": "An elected official or political candidate", | |
| "properties": { | |
| "id": "Unique ID (name_election hash)", | |
| "name": "Cleaned full name", | |
| "name_raw": "Original name from source", | |
| "party": "Political party", | |
| "state": "State they represent/contested", | |
| "election": "Election name (Lok Sabha 2024 etc.)", | |
| "total_assets": "Declared assets (string from affidavit)", | |
| "liabilities": "Declared liabilities", | |
| "criminal_cases": "Number of declared criminal cases", | |
| "education": "Declared education qualification", | |
| "source": "Data source (myneta/eci)", | |
| "scraped_at": "When this was scraped", | |
| }, | |
| "required": ["name", "state"], | |
| "indexes": ["name", "state", "party"], | |
| }, | |
| "Company": { | |
| "description": "A registered company (from MCA/corporate registry)", | |
| "properties": { | |
| "id": "CIN (Corporate Identity Number)", | |
| "name": "Cleaned company name", | |
| "name_raw": "Original name from MCA", | |
| "cin": "Corporate Identity Number", | |
| "status": "Active/Inactive/Struck Off", | |
| "state": "State of registration", | |
| "registration_date": "Date of incorporation", | |
| "company_class": "Private/Public/LLP etc.", | |
| "source": "Data source (mca)", | |
| "scraped_at": "When this was scraped", | |
| }, | |
| "required": ["name"], | |
| "indexes": ["name", "cin", "state"], | |
| }, | |
| "Contract": { | |
| "description": "A government procurement contract from GeM", | |
| "properties": { | |
| "id": "Order ID from GeM", | |
| "order_id": "GeM order reference number", | |
| "seller_name": "Company that won the contract", | |
| "buyer_org": "Government department that bought", | |
| "product": "What was procured", | |
| "amount_crore": "Contract value in crore rupees", | |
| "order_date": "Date of contract", | |
| "state": "State of buyer organisation", | |
| "source": "Data source (gem)", | |
| "scraped_at": "When this was scraped", | |
| }, | |
| "required": ["order_id", "seller_name"], | |
| "indexes": ["order_id", "seller_name", "amount_crore"], | |
| }, | |
| "AuditReport": { | |
| "description": "A CAG audit report flagging financial irregularities", | |
| "properties": { | |
| "id": "URL hash", | |
| "title": "Report title", | |
| "url": "Source URL on cag.gov.in", | |
| "year": "Audit year", | |
| "state": "State audited (or National)", | |
| "scheme": "Scheme/programme audited", | |
| "amount_crore": "Amount of irregularities found", | |
| "irregularity_type": "Type of irregularity", | |
| "finding": "Key finding summary", | |
| "alert_keywords": "Fraud keywords found in title", | |
| "source": "Data source (cag)", | |
| "scraped_at": "When this was scraped", | |
| }, | |
| "required": ["title"], | |
| "indexes": ["title", "state", "year"], | |
| }, | |
| "PressRelease": { | |
| "description": "An official government press release from PIB", | |
| "properties": { | |
| "id": "URL hash", | |
| "title": "Press release title", | |
| "link": "Source URL on pib.gov.in", | |
| "published": "Publication date", | |
| "alert_keywords": "Alert keywords found in title", | |
| "source": "Data source (pib)", | |
| "scraped_at": "When this was scraped", | |
| }, | |
| "required": ["title"], | |
| "indexes": ["title"], | |
| }, | |
| "Ministry": { | |
| "description": "A government ministry or department", | |
| "properties": { | |
| "id": "Slug (ministry-of-finance etc.)", | |
| "name": "Full ministry name", | |
| }, | |
| "required": ["name"], | |
| "indexes": ["name"], | |
| }, | |
| "Party": { | |
| "description": "A political party", | |
| "properties": { | |
| "id": "Slug", | |
| "name": "Party name", | |
| }, | |
| "required": ["name"], | |
| "indexes": ["name"], | |
| }, | |
| "Scheme": { | |
| "description": "A government welfare/development scheme", | |
| "properties": { | |
| "id": "Slug", | |
| "name": "Scheme name (MGNREGA, PM-KISAN etc.)", | |
| }, | |
| "required": ["name"], | |
| "indexes": ["name"], | |
| }, | |
| } | |
| # -- Relationship Types ------------------------------------ | |
| RELATIONSHIP_SCHEMAS = { | |
| # B-05 FIX: FILED_AFFIDAVIT relationship was missing | |
| "FILED_AFFIDAVIT": { | |
| "from": "Politician", "to": "Affidavit", | |
| "description": "Politician filed this election affidavit with ECI", | |
| "properties": {"year": "Election year"}, | |
| }, | |
| "MEMBER_OF": { | |
| "from": "Politician", "to": "Party", | |
| "description": "Politician is a member of this party", | |
| "properties": {"since": "Year joined"}, | |
| }, | |
| "DIRECTOR_OF": { | |
| "from": "Politician", "to": "Company", | |
| "description": "Politician is/was a director of this company", | |
| "properties": { | |
| "confidence": "Match confidence score (0-1)", | |
| "source": "How this link was detected", | |
| }, | |
| }, | |
| "WON_CONTRACT": { | |
| "from": "Company", "to": "Contract", | |
| "description": "Company won this procurement contract", | |
| "properties": {"amount_crore": "Contract value"}, | |
| }, | |
| "AWARDED_BY": { | |
| "from": "Contract", "to": "Ministry", | |
| "description": "Contract was awarded by this ministry/dept", | |
| "properties": {}, | |
| }, | |
| "FLAGS": { | |
| "from": "AuditReport", "to": "Scheme", | |
| "description": "Audit report flagged irregularities in this scheme", | |
| "properties": {"amount_crore": "Amount flagged", "year": "Audit year"}, | |
| }, | |
| "AUDITS": { | |
| "from": "AuditReport", "to": "Ministry", | |
| "description": "Audit report covers this ministry", | |
| "properties": {}, | |
| }, | |
| # B-07 FIX: was wrongly pointing to Scheme -- politicians contest | |
| # elections in Constituencies, not government welfare schemes | |
| "CONTESTED_IN": { | |
| "from": "Politician", "to": "Constituency", | |
| "description": "Politician contested election from this constituency", | |
| "properties": {"year": "Election year", "result": "Won/Lost"}, | |
| }, | |
| } | |
| # -- Cypher constraint + index statements ----------------- | |
| # Run these once when setting up a new Neo4j database. | |
| # -- Full-text index (run once) ----------------------------------------------- | |
| # -- Full-text index (informational -- loader.py is the authoritative caller) | |
| # NEW-1 FIX: replaced deprecated CALL db.index.fulltext.createNodeIndex | |
| # (removed in Neo4j 5.x / AuraDB) with modern CREATE FULLTEXT INDEX syntax | |
| FULLTEXT_INDEX_QUERY = ( | |
| "CREATE FULLTEXT INDEX globalSearch IF NOT EXISTS " | |
| "FOR (n:Politician|Company|Contract|AuditReport|Scheme|Ministry|" | |
| " Party|PressRelease|Tender|RegulatoryOrder|EnforcementAction|" | |
| " ElectoralBond|InsolvencyOrder|NGO|CourtCase|LocalBody|Affidavit) " | |
| "ON EACH [n.name, n.title, n.aliases, n.description, n.item_desc, " | |
| " n.buyer_org, n.seller_name, n.ngo_name, n.company_name, " | |
| " n.accused, n.ministry, n.constituency, n.state]" | |
| ) | |
| SETUP_QUERIES = [ | |
| # Uniqueness constraints (also create indexes automatically) | |
| "CREATE CONSTRAINT politician_id IF NOT EXISTS FOR (n:Politician) REQUIRE n.id IS UNIQUE", | |
| "CREATE CONSTRAINT company_id IF NOT EXISTS FOR (n:Company) REQUIRE n.id IS UNIQUE", | |
| "CREATE CONSTRAINT contract_id IF NOT EXISTS FOR (n:Contract) REQUIRE n.id IS UNIQUE", | |
| "CREATE CONSTRAINT audit_id IF NOT EXISTS FOR (n:AuditReport) REQUIRE n.id IS UNIQUE", | |
| "CREATE CONSTRAINT ministry_id IF NOT EXISTS FOR (n:Ministry) REQUIRE n.id IS UNIQUE", | |
| "CREATE CONSTRAINT party_id IF NOT EXISTS FOR (n:Party) REQUIRE n.id IS UNIQUE", | |
| "CREATE CONSTRAINT scheme_id IF NOT EXISTS FOR (n:Scheme) REQUIRE n.id IS UNIQUE", | |
| # B-05 FIX: Affidavit constraint and index | |
| "CREATE CONSTRAINT affidavit_id IF NOT EXISTS FOR (n:Affidavit) REQUIRE n.id IS UNIQUE", | |
| "CREATE INDEX affidavit_year IF NOT EXISTS FOR (n:Affidavit) ON (n.year)", | |
| # Additional indexes for frequent lookups | |
| "CREATE INDEX politician_name IF NOT EXISTS FOR (n:Politician) ON (n.name)", | |
| "CREATE INDEX company_name IF NOT EXISTS FOR (n:Company) ON (n.name)", | |
| "CREATE INDEX contract_date IF NOT EXISTS FOR (n:Contract) ON (n.order_date)", | |
| # B-08 FIX: fulltext index managed by loader.py setup_schema() only | |
| # Removed conflicting 8-type CALL -- loader has the authoritative 20-type version | |
| ] | |
| def print_schema(): | |
| """Print a human-readable summary of the graph schema.""" | |
| print("=" * 55) | |
| print(" BharatGraph -- Neo4j Schema") | |
| print("=" * 55) | |
| print(f"\nNode types ({len(NODE_SCHEMAS)}):") | |
| for label, schema in NODE_SCHEMAS.items(): | |
| props = len(schema["properties"]) | |
| print(f" ({label}) -- {schema['description'][:50]}") | |
| print(f" {props} properties, indexes on: {schema['indexes']}") | |
| print(f"\nRelationship types ({len(RELATIONSHIP_SCHEMAS)}):") | |
| for rel, schema in RELATIONSHIP_SCHEMAS.items(): | |
| print(f" (:{schema['from']})-[:{rel}]->(:{schema['to']})") | |
| print(f" {schema['description']}") | |
| print(f"\nSetup queries: {len(SETUP_QUERIES)} constraints + indexes") | |
| if __name__ == "__main__": | |
| print_schema() | |