# Spaces: yugbirla / GraphResearcher (Sleeping)
# File size: 21,065 Bytes
# b7d0804

from pathlib import Path

# =====================================================
# Graph schemas
# =====================================================

# Make sure the target package folders exist before any file is written.
# Path.write_text does not create parent directories, so a missing folder
# would raise FileNotFoundError and leave the project half-generated.
for package_dir in ("app/schemas", "app/graph"):
    Path(package_dir).mkdir(parents=True, exist_ok=True)

Path("app/schemas/graph_schema.py").write_text(r'''
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional
from datetime import datetime, timezone


class GraphEntity(BaseModel):
    # One node of a document graph: an extracted entity plus the places in
    # the document where it was seen.

    # Identifier of the entity; the graph builder in this script copies it
    # from the entity extractor output.
    entity_id: str
    # Display name as it was extracted from the text.
    name: str
    # Category label; the extractor in this script emits ACRONYM,
    # ORGANIZATION, TECHNICAL_TERM or CONCEPT.
    entity_type: str = "CONCEPT"

    # Running total of mentions, accumulated chunk by chunk by the builder.
    mention_count: int = 0
    # Pages and chunks in which the entity was found (kept free of
    # duplicates by the builder).
    pages: List[int] = Field(default_factory=list)
    chunk_ids: List[str] = Field(default_factory=list)

    # Alternative names; nothing in this script writes to this field.
    aliases: List[str] = Field(default_factory=list)
    # Supporting snippets; the builder stores at most five dicts with the
    # keys chunk_id, page_number and text_preview.
    evidence: List[Dict[str, Any]] = Field(default_factory=list)


class GraphRelation(BaseModel):
    # One edge of a document graph: a typed link between two entities.

    # Identifier of the edge; the builder copies it from the relation
    # extractor output.
    relation_id: str
    # Entity ids of the two endpoints.
    source_entity_id: str
    target_entity_id: str

    # Display names of the two endpoints, stored alongside the ids so an
    # edge can be shown without looking the entities up.
    source_name: str
    target_name: str

    # Relation label; RELATED_TO is the fallback used when no cue phrase is
    # detected in the sentence.
    relation_type: str = "RELATED_TO"
    # Number of times the builder saw this relation (incremented once per
    # extracted occurrence).
    weight: int = 1

    # Pages and chunks in which the relation was found.
    pages: List[int] = Field(default_factory=list)
    chunk_ids: List[str] = Field(default_factory=list)
    # Supporting sentences; the builder stores at most five dicts with the
    # keys chunk_id, page_number and sentence.
    evidence: List[Dict[str, Any]] = Field(default_factory=list)


class DocumentGraph(BaseModel):
    # Complete graph of one document; this is the object that is serialised
    # to graph.json by the graph storage module.

    # Id of the document the graph was built from.
    document_id: str
    # Original file name, when the processed metadata provides one.
    source_file_name: Optional[str] = None

    # Sizes of the two lists below; the builder sets them from len().
    total_entities: int = 0
    total_relations: int = 0

    # Nodes and edges; the builder stores them sorted by mention count and
    # weight respectively, highest first.
    entities: List[GraphEntity] = Field(default_factory=list)
    relations: List[GraphRelation] = Field(default_factory=list)

    # Free-form information about how the graph was produced.
    build_metadata: Dict[str, Any] = Field(default_factory=dict)

    # ISO-8601 timestamp in UTC, taken when the model instance is created.
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
''', encoding="utf-8")


# =====================================================
# Entity extractor
# =====================================================

Path("app/graph/entity_extractor.py").write_text(r'''
import re
from typing import List, Dict, Any


# Capitalised words that the extraction patterns pick up but that are not
# entities (determiners, pronouns, document-structure words).
# is_valid_entity rejects a candidate whose name equals one of these
# exactly; the comparison is case-sensitive.
STOP_ENTITIES = {
    "The", "This", "That", "These", "Those", "It", "They", "We", "You",
    "Page", "Chapter", "Figure", "Table", "Example", "Answer", "Question",
    "Introduction", "Conclusion", "Summary", "Overview"
}


def normalize_entity_name(name: str) -> str:
    """Return *name* with whitespace collapsed and edge punctuation removed.

    Every whitespace run becomes a single space. Spaces and the characters
    ``.,;:()[]{}`` are then stripped from both ends in one pass, so removing
    punctuation can never expose a leading or trailing space
    (``"Foo ."`` becomes ``"Foo"``). A falsy *name* yields ``""``.
    """
    collapsed = re.sub(r"\s+", " ", name or "")
    return collapsed.strip(" .,;:()[]{}")


def make_entity_id(name: str) -> str:
    """Turn an entity name into a lower-case, underscore-separated id.

    Runs of characters other than ``a-z`` and ``0-9`` become one underscore
    and the result is limited to 80 characters. Underscores are trimmed
    again after the cut, so a truncated id never ends in ``_``. Returns
    ``"unknown_entity"`` when nothing usable is left.
    """
    slug = re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")
    # The cut at 80 characters can land right after a separator.
    slug = slug[:80].rstrip("_")
    return slug or "unknown_entity"


def classify_entity(name: str) -> str:
    """Assign a coarse category label to an entity name.

    Checks run in this order and the first hit wins:

    * ``ACRONYM`` - two to ten upper-case letters or digits, starting with
      a letter.
    * ``ORGANIZATION`` - the name contains one of the organisation markers
      as a whole word (case-insensitive). Whole-word matching keeps names
      such as "Corpus" or "Principal Component" from matching the short
      markers "Corp" and "Inc".
    * ``TECHNICAL_TERM`` - the name contains a digit, a hyphen or a slash.
    * ``CONCEPT`` - everything else.
    """
    if re.fullmatch(r"[A-Z][A-Z0-9]{1,9}", name):
        return "ACRONYM"

    org_markers = (
        "University", "Institute", "Corporation", "Corp", "Inc", "Ltd",
        "Company", "OpenAI", "Microsoft", "Google", "Amazon"
    )
    org_pattern = (
        r"\b(?:" + "|".join(re.escape(marker) for marker in org_markers) + r")\b"
    )

    if re.search(org_pattern, name, flags=re.IGNORECASE):
        return "ORGANIZATION"

    if any(char.isdigit() for char in name) or "-" in name or "/" in name:
        return "TECHNICAL_TERM"

    return "CONCEPT"


def is_valid_entity(name: str) -> bool:
    """Tell whether a normalised candidate name should be kept as an entity.

    A name is rejected when it is empty, listed in STOP_ENTITIES, shorter
    than 2 or longer than 80 characters, or (ignoring case) one of a few
    connecting words.
    """
    if not name or name in STOP_ENTITIES:
        return False

    if not 2 <= len(name) <= 80:
        return False

    connectors = {"and", "or", "but", "with", "from", "into"}
    return name.lower() not in connectors


def extract_entities_from_text(text: str) -> List[Dict[str, Any]]:
    """Extract candidate entities from *text* with two regex passes.

    The first pass collects acronyms (RAG, LLM, API, OCR), the second one
    collects capitalised phrases of up to six words such as
    "Retrieval-Augmented Generation". Candidates are normalised, filtered
    with is_valid_entity and de-duplicated by entity id; the first
    occurrence of an id wins.

    Returns a list of dicts with the keys entity_id, name and entity_type,
    in order of first appearance (acronyms before phrases).
    """
    if not text:
        return []

    acronym_pattern = r"\b[A-Z][A-Z0-9]{1,9}\b"
    phrase_pattern = (
        r"\b[A-Z][a-zA-Z0-9]*(?:[-/][A-Z]?[a-zA-Z0-9]+)?"
        r"(?:\s+[A-Z][a-zA-Z0-9]*(?:[-/][A-Z]?[a-zA-Z0-9]+)?){0,5}\b"
    )

    # Neither pattern has a capturing group, so findall yields full matches.
    raw_names = re.findall(acronym_pattern, text)
    raw_names.extend(re.findall(phrase_pattern, text))

    # Dicts keep insertion order, which preserves first-seen ordering.
    unique_entities: Dict[str, Dict[str, Any]] = {}

    for raw_name in raw_names:
        name = normalize_entity_name(raw_name)

        if not is_valid_entity(name):
            continue

        entity_id = make_entity_id(name)

        if entity_id not in unique_entities:
            unique_entities[entity_id] = {
                "entity_id": entity_id,
                "name": name,
                "entity_type": classify_entity(name)
            }

    return list(unique_entities.values())


def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences and drop the very short ones.

    The text is cut at whitespace that follows ".", "!" or "?". Each piece
    is trimmed and only pieces longer than 20 characters are returned.
    """
    if not text:
        return []

    sentences = []

    for fragment in re.split(r"(?<=[.!?])\s+", text):
        trimmed = fragment.strip()

        if len(trimmed) > 20:
            sentences.append(trimmed)

    return sentences
''', encoding="utf-8")


# =====================================================
# Relation extractor
# =====================================================

Path("app/graph/relation_extractor.py").write_text(r'''
import itertools
import re
from typing import List, Dict, Any

from app.graph.entity_extractor import make_entity_id, split_sentences


# Lower-case cue phrases and the relation label each one stands for.
# detect_relation_type walks the entries in insertion order and returns the
# label of the first phrase it finds, so the order of the entries matters:
# the specific multi-word phrases come first and the generic "is"/"are"
# come last.
VERB_RELATION_MAP = {
    "stands for": "STANDS_FOR",
    "refers to": "REFERS_TO",
    "uses": "USES",
    "use": "USES",
    "retrieves": "RETRIEVES",
    "retrieve": "RETRIEVES",
    "generates": "GENERATES",
    "generate": "GENERATES",
    "provides": "PROVIDES",
    "provide": "PROVIDES",
    "reduces": "REDUCES",
    "reduce": "REDUCES",
    "improves": "IMPROVES",
    "improve": "IMPROVES",
    "contains": "CONTAINS",
    "include": "INCLUDES",
    "includes": "INCLUDES",
    "is": "IS_A",
    "are": "IS_A"
}


def relation_id(source_id: str, relation_type: str, target_id: str) -> str:
    """Build the id of a relation from its endpoints and its type.

    The three parts are joined with a double underscore (the type in lower
    case) and the result is cut to 160 characters.
    """
    parts = (source_id, relation_type.lower(), target_id)
    return "__".join(parts)[:160]


def entity_appears_in_sentence(entity_name: str, sentence: str) -> bool:
    """Report whether *entity_name* occurs in *sentence* as a whole word.

    The name is matched literally, ignoring case, with a word boundary on
    each side.
    """
    bounded_name = rf"\b{re.escape(entity_name)}\b"
    return bool(re.search(bounded_name, sentence, flags=re.IGNORECASE))


def extract_relations_from_text(
    text: str,
    entities: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Derive relations from entities that share a sentence.

    The text is split into sentences. For every sentence that mentions at
    least two of the given entities, one relation is emitted per pair of
    mentioned entities. All pairs of a sentence share the relation type
    detected for that sentence, and the sentence itself is attached as
    evidence.

    Returns a list of dicts with the keys relation_id, source_entity_id,
    target_entity_id, source_name, target_name, relation_type and
    evidence_sentence. The list is empty when *text* is empty or fewer than
    two entities are supplied.
    """
    if not text or len(entities) < 2:
        return []

    found: List[Dict[str, Any]] = []

    for sentence in split_sentences(text):
        # Only the first six mentioned entities are paired up, which caps
        # the number of relations one sentence can produce.
        mentioned = [
            candidate for candidate in entities
            if entity_appears_in_sentence(candidate["name"], sentence)
        ][:6]

        if len(mentioned) < 2:
            continue

        label = detect_relation_type(sentence)

        for left, right in itertools.combinations(mentioned, 2):
            left_id = left["entity_id"]
            right_id = right["entity_id"]

            if left_id == right_id:
                continue

            found.append(
                {
                    "relation_id": relation_id(left_id, label, right_id),
                    "source_entity_id": left_id,
                    "target_entity_id": right_id,
                    "source_name": left["name"],
                    "target_name": right["name"],
                    "relation_type": label,
                    "evidence_sentence": sentence
                }
            )

    return found


def detect_relation_type(sentence: str) -> str:
    """Pick a relation label for *sentence* from the cue phrases it contains.

    The entries of VERB_RELATION_MAP are tried in insertion order and the
    label of the first phrase found is returned. A phrase only counts when
    it appears as whole words: plain substring matching would find "is"
    inside "this" and "uses" inside "discusses" and mislabel most
    sentences. Returns ``"RELATED_TO"`` when no phrase is present.
    """
    sentence_lower = sentence.lower()

    for phrase, relation_type in VERB_RELATION_MAP.items():
        whole_phrase = r"\b" + re.escape(phrase) + r"\b"

        if re.search(whole_phrase, sentence_lower):
            return relation_type

    return "RELATED_TO"
''', encoding="utf-8")


# =====================================================
# Graph storage
# =====================================================

Path("app/graph/graph_storage.py").write_text(r'''
import json
from typing import Optional

from app.core.config import settings
from app.schemas.graph_schema import DocumentGraph


def get_graph_path(document_id: str):
    """Return the path of graph.json for a document.

    The document folder below settings.PROCESSED_DIR is created first when
    it does not exist yet, so the returned path can be written to directly.
    """
    target_dir = settings.PROCESSED_DIR / document_id
    target_dir.mkdir(parents=True, exist_ok=True)
    graph_file = target_dir / "graph.json"
    return graph_file


def save_document_graph(graph: DocumentGraph) -> None:
    """Write *graph* to the graph.json file of its document.

    The file is UTF-8 encoded JSON, indented by two spaces, with non-ASCII
    characters written as-is. An existing file is overwritten.
    """
    destination = get_graph_path(graph.document_id)

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(graph.model_dump(), handle, indent=2, ensure_ascii=False)


def read_document_graph(document_id: str) -> Optional[DocumentGraph]:
    """Load the stored graph of a document.

    Returns the parsed DocumentGraph, or ``None`` when no graph.json file
    exists for *document_id*. Reading never creates anything on disk.
    """
    # The path is built here rather than through get_graph_path(), because
    # that helper creates the document folder. A lookup for an unknown
    # document id must not leave an empty folder behind.
    graph_path = settings.PROCESSED_DIR / document_id / "graph.json"

    if not graph_path.is_file():
        return None

    with open(graph_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return DocumentGraph(**data)
''', encoding="utf-8")


# =====================================================
# Graph builder
# =====================================================

Path("app/graph/graph_builder.py").write_text(r'''
from datetime import datetime, timezone
from typing import Dict, Any, List

from app.storage.processed_storage import (
    read_processed_chunks,
    read_processed_metadata
)
from app.schemas.graph_schema import (
    DocumentGraph,
    GraphEntity,
    GraphRelation
)
from app.graph.entity_extractor import extract_entities_from_text
from app.graph.relation_extractor import extract_relations_from_text
from app.graph.graph_storage import save_document_graph


def get_value(obj, key: str, default=None):
    """Read *key* from a dict or the attribute of that name from an object.

    Returns *default* when the key or attribute is missing. This lets the
    builder handle chunks that are plain dicts as well as chunk objects.
    """
    if not isinstance(obj, dict):
        return getattr(obj, key, default)

    return obj.get(key, default)


def add_unique(existing_list: List, value):
    """Append *value* to *existing_list* unless it is None or already there.

    The list is changed in place; nothing is returned.
    """
    if value is not None and value not in existing_list:
        existing_list.append(value)


def _count_mentions(name: str, text: str) -> int:
    """Count whole-word, case-insensitive occurrences of *name* in *text*.

    The words of a multi-word name may be separated by any run of
    whitespace, so a name that wraps across a line break is still counted.
    Plain substring counting would count "api" inside "capital".
    """
    import re  # local import: this module has no file-level import of re

    words = [re.escape(word) for word in name.split()]

    if not words:
        return 0

    pattern = r"(?<!\w)" + r"\s+".join(words) + r"(?!\w)"
    return len(re.findall(pattern, text, flags=re.IGNORECASE))


def build_document_graph(document_id: str) -> Dict[str, Any]:
    """Build the entity/relation graph of a processed document and save it.

    Every processed chunk is scanned for entities and for relations between
    entities that share a sentence. Results are merged across chunks:
    entities accumulate a mention count, relations accumulate a weight, and
    both collect the chunk ids, page numbers and up to five evidence
    snippets. The finished graph is written with save_document_graph.

    Returns a summary dict. On success it has status "success", the totals
    and the 15 highest ranked entities and relations. When
    read_processed_chunks returns None it has status "failed" and a
    message, and nothing is saved.
    """
    chunks = read_processed_chunks(document_id)

    if chunks is None:
        return {
            "status": "failed",
            "message": "No processed chunks found for this document. Upload and process the document first.",
            "document_id": document_id
        }

    metadata = read_processed_metadata(document_id) or {}
    source_file_name = None

    if isinstance(metadata, dict):
        source_file_name = metadata.get("source_file_name") or metadata.get("filename")

    # Both maps are keyed by id so repeated sightings merge into one record.
    entity_map: Dict[str, GraphEntity] = {}
    relation_map: Dict[str, GraphRelation] = {}

    for chunk in chunks:
        # Chunks may expose their text as "content" or as "text".
        content = (
            get_value(chunk, "content")
            or get_value(chunk, "text")
            or ""
        )

        if not content:
            continue

        chunk_id = get_value(chunk, "chunk_id", "")
        page_number = get_value(chunk, "page_number", None)

        extracted_entities = extract_entities_from_text(content)

        for item in extracted_entities:
            entity_id = item["entity_id"]

            if entity_id not in entity_map:
                entity_map[entity_id] = GraphEntity(
                    entity_id=entity_id,
                    name=item["name"],
                    entity_type=item["entity_type"],
                    mention_count=0
                )

            entity = entity_map[entity_id]
            entity.mention_count += _count_mentions(item["name"], content)

            add_unique(entity.chunk_ids, chunk_id)
            add_unique(entity.pages, page_number)

            # Keep the stored graph small: five snippets per entity.
            if len(entity.evidence) < 5:
                entity.evidence.append(
                    {
                        "chunk_id": chunk_id,
                        "page_number": page_number,
                        "text_preview": content[:250]
                    }
                )

        extracted_relations = extract_relations_from_text(
            text=content,
            entities=extracted_entities
        )

        for item in extracted_relations:
            rel_id = item["relation_id"]

            if rel_id not in relation_map:
                # Weight starts at 0 because it is incremented right below.
                relation_map[rel_id] = GraphRelation(
                    relation_id=rel_id,
                    source_entity_id=item["source_entity_id"],
                    target_entity_id=item["target_entity_id"],
                    source_name=item["source_name"],
                    target_name=item["target_name"],
                    relation_type=item["relation_type"],
                    weight=0
                )

            relation = relation_map[rel_id]
            relation.weight += 1

            add_unique(relation.chunk_ids, chunk_id)
            add_unique(relation.pages, page_number)

            if len(relation.evidence) < 5:
                relation.evidence.append(
                    {
                        "chunk_id": chunk_id,
                        "page_number": page_number,
                        "sentence": item["evidence_sentence"]
                    }
                )

    # Most mentioned entities and heaviest relations come first.
    entities = sorted(
        entity_map.values(),
        key=lambda entity: entity.mention_count,
        reverse=True
    )

    relations = sorted(
        relation_map.values(),
        key=lambda relation: relation.weight,
        reverse=True
    )

    graph = DocumentGraph(
        document_id=document_id,
        source_file_name=source_file_name,
        total_entities=len(entities),
        total_relations=len(relations),
        entities=entities,
        relations=relations,
        build_metadata={
            "builder": "rule_based_entity_relation_extractor",
            "created_at": datetime.now(timezone.utc).isoformat(),
            "chunk_count": len(chunks),
            "note": "This is the graph foundation layer before adding a dedicated graph database."
        }
    )

    save_document_graph(graph)

    return {
        "status": "success",
        "message": "Document graph built successfully.",
        "document_id": document_id,
        "total_entities": graph.total_entities,
        "total_relations": graph.total_relations,
        "top_entities": [
            {
                "entity_id": entity.entity_id,
                "name": entity.name,
                "type": entity.entity_type,
                "mention_count": entity.mention_count
            }
            for entity in entities[:15]
        ],
        "top_relations": [
            {
                "source": relation.source_name,
                "relation": relation.relation_type,
                "target": relation.target_name,
                "weight": relation.weight
            }
            for relation in relations[:15]
        ]
    }
''', encoding="utf-8")


# =====================================================
# Graph query service
# =====================================================

Path("app/graph/graph_query_service.py").write_text(r'''
from typing import Dict, Any, Optional

from app.graph.graph_storage import read_document_graph


def list_entities(
    document_id: str,
    limit: int = 50,
    entity_type: Optional[str] = None
) -> Dict[str, Any]:
    """List the entities of a stored graph, optionally filtered by type.

    *entity_type* is compared case-insensitively. The result reports how
    many entities matched (total_entities), how many are returned
    (returned_entities) and the first *limit* of them. When no graph is
    stored the status is "failed" and the entity list is empty.
    """
    graph = read_document_graph(document_id)

    if graph is None:
        return {
            "status": "failed",
            "message": "Graph not found. Build the graph first.",
            "entities": []
        }

    selected = graph.entities

    if entity_type:
        wanted_type = entity_type.lower()
        selected = [
            candidate for candidate in selected
            if candidate.entity_type.lower() == wanted_type
        ]

    page = selected[:limit]

    return {
        "status": "success",
        "document_id": document_id,
        "total_entities": len(selected),
        "returned_entities": len(page),
        "entities": page
    }


def search_entities(
    document_id: str,
    query: str,
    limit: int = 20
) -> Dict[str, Any]:
    """Find entities whose name or id contains *query*.

    The query is trimmed and compared case-insensitively as a substring.
    The result reports the number of matches (total_matches) and the first
    *limit* matching entities. When no graph is stored the status is
    "failed" and the entity list is empty.
    """
    graph = read_document_graph(document_id)

    if graph is None:
        return {
            "status": "failed",
            "message": "Graph not found. Build the graph first.",
            "entities": []
        }

    needle = query.lower().strip()
    hits = []

    for candidate in graph.entities:
        in_name = needle in candidate.name.lower()

        if in_name or needle in candidate.entity_id.lower():
            hits.append(candidate)

    return {
        "status": "success",
        "document_id": document_id,
        "query": query,
        "total_matches": len(hits),
        "entities": hits[:limit]
    }


def get_entity_neighborhood(
    document_id: str,
    entity: str,
    limit: int = 50
) -> Dict[str, Any]:
    """Return one entity together with its direct relations and neighbours.

    *entity* is trimmed and compared case-insensitively. An entity whose id
    or name equals the query is preferred. Only when there is no exact
    match does the first entity whose name contains the query get used.
    This keeps a query such as "rag" from resolving to "Storage" just
    because that entity is listed earlier.

    The result holds the centre entity, up to *limit* edges that touch it,
    and the entities at either end of those edges. total_related_edges is
    the number of touching edges before the limit is applied. The status is
    "failed" when no graph is stored or no entity matches.
    """
    graph = read_document_graph(document_id)

    if graph is None:
        return {
            "status": "failed",
            "message": "Graph not found. Build the graph first.",
            "nodes": [],
            "edges": []
        }

    entity_lower = entity.lower().strip()

    # First pass: exact match on id or name.
    matched_entity = next(
        (
            item for item in graph.entities
            if entity_lower in (item.entity_id.lower(), item.name.lower())
        ),
        None
    )

    # Second pass: fall back to the first partial name match.
    if matched_entity is None:
        matched_entity = next(
            (
                item for item in graph.entities
                if entity_lower in item.name.lower()
            ),
            None
        )

    if matched_entity is None:
        return {
            "status": "failed",
            "message": "Entity not found in graph.",
            "entity": entity,
            "nodes": [],
            "edges": []
        }

    center_id = matched_entity.entity_id

    all_related_edges = [
        relation for relation in graph.relations
        if center_id in (relation.source_entity_id, relation.target_entity_id)
    ]

    # Count before truncating so the total stays meaningful for callers
    # that page through the edges.
    total_related_edges = len(all_related_edges)
    related_edges = all_related_edges[:limit]

    node_ids = {center_id}

    for edge in related_edges:
        node_ids.add(edge.source_entity_id)
        node_ids.add(edge.target_entity_id)

    nodes = [
        graph_entity for graph_entity in graph.entities
        if graph_entity.entity_id in node_ids
    ]

    return {
        "status": "success",
        "document_id": document_id,
        "center_entity": matched_entity,
        "total_related_edges": total_related_edges,
        "nodes": nodes,
        "edges": related_edges
    }
''', encoding="utf-8")


# =====================================================
# Patch main.py
# =====================================================

# Load app/main.py; every change below is made on the in-memory text.
main_path = Path("app/main.py")
text = main_path.read_text(encoding="utf-8")

graph_imports = '''from app.graph.graph_builder import build_document_graph
from app.graph.graph_storage import read_document_graph
from app.graph.graph_query_service import (
    list_entities,
    search_entities,
    get_entity_neighborhood
)
'''

# The marker check keeps the patch idempotent across repeated runs.
if "from app.graph.graph_builder import build_document_graph" not in text:
    text = graph_imports + text

# The endpoints appended further down reference HTTPException, Query and
# Optional. Make sure main.py imports them, otherwise the patched module
# fails with NameError at import time. A repeated import is harmless, so a
# line is added whenever this exact text is absent.
# NOTE(review): like graph_imports, these lines are prepended to the file;
# if main.py starts with a "from __future__" import, that import has to
# stay first - confirm against the real main.py.
endpoint_dependency_imports = [
    "from fastapi import HTTPException, Query",
    "from typing import Optional"
]

for import_line in endpoint_dependency_imports:
    if import_line not in text:
        text = import_line + "\n" + text

# Any earlier phase label found in main.py is replaced by the current one.
old_phases = [
    "Phase 13 - Deployment Demo Stabilization",
    "Phase 12 - Hugging Face Hosted LLM Provider Hardening",
    "Phase 11 - Hugging Face Deployment Readiness",
    "Phase 10 - LLM Provider Abstraction",
    "Phase 9 - Answer Evaluation System",
    "Phase 8 - Retrieval Evaluation System"
]

for old in old_phases:
    text = text.replace(old, "Phase 14 - Graph Foundation Entity Relation Extraction")

if "# Graph foundation endpoints" not in text:
    text += '''

# Graph foundation endpoints

@app.post("/documents/{document_id}/graph/build")
def build_graph_for_document(document_id: str):
    """Build the graph of a processed document and return the build summary.

    Responds with HTTP 400 when the builder reports a failed status.
    """
    outcome = build_document_graph(document_id)
    build_failed = outcome.get("status") == "failed"

    if build_failed:
        reason = outcome.get("message", "Graph build failed.")
        raise HTTPException(status_code=400, detail=reason)

    return outcome


@app.get("/documents/{document_id}/graph")
def get_document_graph(document_id: str):
    """Return the stored graph of a document, or HTTP 404 when none exists."""
    stored_graph = read_document_graph(document_id)

    if stored_graph is not None:
        return stored_graph

    raise HTTPException(
        status_code=404,
        detail="Graph not found. Build the graph first."
    )


@app.get("/documents/{document_id}/graph/entities")
def get_graph_entities(
    document_id: str,
    limit: int = Query(50, ge=1, le=500),
    entity_type: Optional[str] = None
):
    """List graph entities of a document, optionally filtered by type.

    Responds with HTTP 404 when the listing reports a failed status.
    """
    listing = list_entities(
        document_id=document_id,
        limit=limit,
        entity_type=entity_type
    )

    if listing.get("status") != "failed":
        return listing

    raise HTTPException(status_code=404, detail=listing.get("message"))


@app.get("/documents/{document_id}/graph/search")
def search_graph_entities(
    document_id: str,
    query: str = Query(..., min_length=1),
    limit: int = Query(20, ge=1, le=100)
):
    """Search graph entities of a document by name or id.

    Responds with HTTP 404 when the search reports a failed status.
    """
    search_result = search_entities(
        document_id=document_id,
        query=query,
        limit=limit
    )

    if search_result.get("status") != "failed":
        return search_result

    raise HTTPException(status_code=404, detail=search_result.get("message"))


@app.get("/documents/{document_id}/graph/neighborhood")
def get_graph_neighborhood(
    document_id: str,
    entity: str = Query(..., min_length=1),
    limit: int = Query(50, ge=1, le=200)
):
    """Return an entity with its direct relations and neighbouring entities.

    Responds with HTTP 404 when the lookup reports a failed status.
    """
    neighborhood = get_entity_neighborhood(
        document_id=document_id,
        entity=entity,
        limit=limit
    )

    if neighborhood.get("status") != "failed":
        return neighborhood

    raise HTTPException(status_code=404, detail=neighborhood.get("message"))
'''

# Write the patched text back to app/main.py in a single step, after all
# in-memory edits above have been applied.
main_path.write_text(text, encoding="utf-8")

print("Phase 14 graph foundation files created successfully.")