Spaces:

Madras1
/

Numidium

Runtime error

App Files Files Community

Madras1 committed on Jan 11

Commit

8fea8c3

verified ·

1 Parent(s): 2747628

Upload 45 files

Browse files

Files changed (6) hide show

app/api/routes/analyze.py +257 -0
app/config.py +3 -0
app/main.py +2 -1
app/services/nlp/__init__.py +2 -0
app/services/nlp/entity_extractor.py +243 -0
requirements.txt +1 -0

app/api/routes/analyze.py ADDED Viewed

	@@ -0,0 +1,257 @@

+"""
+Analyze API Routes - LLM-based text analysis
+"""
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel, Field
+from typing import Optional, List
+from sqlalchemy.orm import Session
+from app.core.database import get_db
+from app.services.nlp import entity_extractor
+from app.models.entity import Entity, Relationship, Event
+from app.schemas.entity import EntityCreate, EntityResponse
+# Router for the analysis endpoints; routes declared on it are served under the "/analyze" prefix.
+router = APIRouter(prefix="/analyze", tags=["Analysis"])
+class AnalyzeRequest(BaseModel):
+    """Request body accepted by the analysis endpoints."""
+    # Raw text to run extraction on; the field constraint rejects anything shorter than 10 characters.
+    text: str = Field(..., min_length=10, description="Text to analyze")
+    # Opt-in flag, off by default; analyze_text reads it to decide whether to write results to the database.
+    auto_create: bool = Field(default=False, description="Auto-create extracted entities in database")
+class ExtractedEntityResponse(BaseModel):
+    """One entity reported by the extractor, as returned to the API client."""
+    name: str
+    type: str  # Type label exactly as returned by the extractor (not normalized here)
+    role: Optional[str] = None
+    aliases: Optional[List[str]] = None
+    description: Optional[str] = None
+    created: bool = False  # Whether it was created in DB
+    entity_id: Optional[str] = None  # DB ID of the new row, or of the existing row it matched
+class ExtractedRelationshipResponse(BaseModel):
+    """One relationship reported by the extractor, as returned to the API client."""
+    source: str  # Name of the entity the relationship starts from
+    target: str  # Name of the entity the relationship points to
+    relationship_type: str
+    context: Optional[str] = None
+    created: bool = False  # Set to True by analyze_text when a Relationship row was inserted
+class ExtractedEventResponse(BaseModel):
+    """One event reported by the extractor, as returned to the API client."""
+    description: str
+    event_type: Optional[str] = None
+    date: Optional[str] = None  # Kept as the free-form string the extractor produced
+    location: Optional[str] = None
+    participants: Optional[List[str]] = None
+    created: bool = False  # Set to True by analyze_text when an Event row was inserted
+    event_id: Optional[str] = None  # DB ID of the inserted Event row, if any
+class AnalyzeResponse(BaseModel):
+    """Full payload returned by the POST /analyze endpoint."""
+    entities: List[ExtractedEntityResponse]
+    relationships: List[ExtractedRelationshipResponse]
+    events: List[ExtractedEventResponse]
+    # Counters filled in by analyze_text: total_* (items extracted) and created_* (rows inserted).
+    stats: dict
+@router.post("", response_model=AnalyzeResponse)
+async def analyze_text(request: AnalyzeRequest):
+    """
+    Analyze text using LLM to extract entities, relationships, and events.
+    Uses Cerebras API with Qwen 3 235B for intelligent extraction.
+    Args:
+        text: Text to analyze (min 10 characters)
+        auto_create: If true, automatically creates entities in the database
+    Returns:
+        Extracted entities, relationships, events, and statistics
+    """
+    try:
+        # Extract using LLM
+        result = await entity_extractor.extract(request.text)
+        # Prepare response
+        entities_response = []
+        relationships_response = []
+        events_response = []
+        created_entities = 0
+        created_relationships = 0
+        created_events = 0
+        db = next(get_db())
+        # Process entities
+        for entity in result.entities:
+            entity_data = ExtractedEntityResponse(
+                name=entity.name,
+                type=entity.type,
+                role=entity.role,
+                aliases=entity.aliases,
+                description=entity.description,
+                created=False
+            )
+            if request.auto_create and entity.name:
+                # Check if entity already exists
+                existing = db.query(Entity).filter(
+                    Entity.name.ilike(f"%{entity.name}%")
+                ).first()
+                if not existing:
+                    # Create new entity
+                    new_entity = Entity(
+                        name=entity.name,
+                        type=entity.type if entity.type in ["person", "organization", "location", "event"] else "person",
+                        description=entity.description or entity.role or "",
+                        source="llm_extraction",
+                        properties={"role": entity.role, "aliases": entity.aliases}
+                    )
+                    db.add(new_entity)
+                    db.commit()
+                    db.refresh(new_entity)
+                    entity_data.created = True
+                    entity_data.entity_id = new_entity.id
+                    created_entities += 1
+                else:
+                    entity_data.entity_id = existing.id
+            entities_response.append(entity_data)
+        # Process relationships
+        for rel in result.relationships:
+            rel_data = ExtractedRelationshipResponse(
+                source=rel.source,
+                target=rel.target,
+                relationship_type=rel.relationship_type,
+                context=rel.context,
+                created=False
+            )
+            if request.auto_create:
+                # Find source and target entities
+                source_entity = db.query(Entity).filter(
+                    Entity.name.ilike(f"%{rel.source}%")
+                ).first()
+                target_entity = db.query(Entity).filter(
+                    Entity.name.ilike(f"%{rel.target}%")
+                ).first()
+                if source_entity and target_entity:
+                    # Check if relationship exists
+                    existing_rel = db.query(Relationship).filter(
+                        Relationship.source_id == source_entity.id,
+                        Relationship.target_id == target_entity.id,
+                        Relationship.relationship_type == rel.relationship_type
+                    ).first()
+                    if not existing_rel:
+                        new_rel = Relationship(
+                            source_id=source_entity.id,
+                            target_id=target_entity.id,
+                            relationship_type=rel.relationship_type,
+                            description=rel.context
+                        )
+                        db.add(new_rel)
+                        db.commit()
+                        rel_data.created = True
+                        created_relationships += 1
+            relationships_response.append(rel_data)
+        # Process events
+        for event in result.events:
+            event_data = ExtractedEventResponse(
+                description=event.description,
+                event_type=event.event_type,
+                date=event.date,
+                location=event.location,
+                participants=event.participants,
+                created=False
+            )
+            if request.auto_create and event.description:
+                # Create event
+                new_event = Event(
+                    title=event.description[:100] if len(event.description) > 100 else event.description,
+                    description=event.description,
+                    event_type=event.event_type or "general",
+                    source="llm_extraction"
+                )
+                db.add(new_event)
+                db.commit()
+                db.refresh(new_event)
+                event_data.created = True
+                event_data.event_id = new_event.id
+                created_events += 1
+            events_response.append(event_data)
+        return AnalyzeResponse(
+            entities=entities_response,
+            relationships=relationships_response,
+            events=events_response,
+            stats={
+                "total_entities": len(entities_response),
+                "total_relationships": len(relationships_response),
+                "total_events": len(events_response),
+                "created_entities": created_entities,
+                "created_relationships": created_relationships,
+                "created_events": created_events
+            }
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+@router.post("/quick")
+async def quick_analyze(request: AnalyzeRequest):
+    """
+    Quick analysis without database operations.
+    Returns only extracted data without creating anything.
+    """
+    try:
+        result = await entity_extractor.extract(request.text)
+        return {
+            "entities": [
+                {
+                    "name": e.name,
+                    "type": e.type,
+                    "role": e.role,
+                    "aliases": e.aliases
+                }
+                for e in result.entities
+            ],
+            "relationships": [
+                {
+                    "source": r.source,
+                    "target": r.target,
+                    "type": r.relationship_type,
+                    "context": r.context
+                }
+                for r in result.relationships
+            ],
+            "events": [
+                {
+                    "description": ev.description,
+                    "type": ev.event_type,
+                    "date": ev.date,
+                    "participants": ev.participants
+                }
+                for ev in result.events
+            ]
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")

app/config.py CHANGED Viewed

@@ -20,6 +20,9 @@ class Settings(BaseSettings):
     # APIs (opcional - pode configurar depois)
     newsapi_key: str = ""
     # CORS
     cors_origins: list[str] = ["*"]

     # APIs (opcional - pode configurar depois)
     newsapi_key: str = ""
+    # Cerebras API for LLM-based entity extraction
+    cerebras_api_key: str = ""
     # CORS
     cors_origins: list[str] = ["*"]

app/main.py CHANGED Viewed

@@ -8,7 +8,7 @@ from contextlib import asynccontextmanager
 from app.config import settings
 from app.core.database import init_db
-from app.api.routes import entities, relationships, events, search, ingest
 @asynccontextmanager
@@ -55,6 +55,7 @@ app.include_router(relationships.router, prefix="/api/v1")
 app.include_router(events.router, prefix="/api/v1")
 app.include_router(search.router, prefix="/api/v1")
 app.include_router(ingest.router, prefix="/api/v1")
 @app.get("/")

 from app.config import settings
 from app.core.database import init_db
+from app.api.routes import entities, relationships, events, search, ingest, analyze
 @asynccontextmanager
 app.include_router(events.router, prefix="/api/v1")
 app.include_router(search.router, prefix="/api/v1")
 app.include_router(ingest.router, prefix="/api/v1")
+app.include_router(analyze.router, prefix="/api/v1")
 @app.get("/")

app/services/nlp/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # NLP Services
2	+ from .entity_extractor import entity_extractor

app/services/nlp/entity_extractor.py ADDED Viewed

	@@ -0,0 +1,243 @@

+"""
+Entity Extractor Service - LLM-based NER
+Uses Cerebras API with Qwen 3 235B for intelligent entity and relationship extraction
+"""
+import json
+import re
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass
+import httpx
+from app.config import settings
+@dataclass
+class ExtractedEntity:
+    """A single entity parsed out of the LLM reply."""
+    name: str
+    type: str  # person, organization, location, event
+    role: Optional[str] = None
+    aliases: Optional[List[str]] = None
+    description: Optional[str] = None
+    # Coordinates are declared but not filled in by EntityExtractor._parse_response,
+    # so they stay None unless a caller sets them.
+    latitude: Optional[float] = None
+    longitude: Optional[float] = None
+@dataclass
+class ExtractedRelationship:
+    """A directed relationship between two entities, referenced by name."""
+    source: str  # Name of the entity the relationship starts from
+    target: str  # Name of the entity the relationship points to
+    relationship_type: str
+    context: Optional[str] = None
+@dataclass
+class ExtractedEvent:
+    """A single event parsed out of the LLM reply."""
+    description: str
+    event_type: Optional[str] = None
+    date: Optional[str] = None  # Stored as the string the model returned; not parsed into a date
+    location: Optional[str] = None
+    participants: Optional[List[str]] = None
+@dataclass
+class ExtractionResult:
+    """Everything extracted from one text: entities, relationships and events."""
+    entities: List[ExtractedEntity]
+    relationships: List[ExtractedRelationship]
+    events: List[ExtractedEvent]
+    # Unparsed model reply, kept for debugging; None when no LLM call was made.
+    raw_response: Optional[str] = None
+# Portuguese-language instruction prompt sent to the model as the user message.
+# The trailing "{text}" marks where the text under analysis is inserted.
+# NOTE(review): the JSON example below contains literal, unescaped "{" and "}",
+# so this template is not safe to pass through str.format as-is; substitute the
+# "{text}" marker literally, or double the braces of the example.
+EXTRACTION_PROMPT = """Você é um especialista em extração de informações estruturadas de textos.
+Analise o texto fornecido e extraia TODAS as entidades, relacionamentos e eventos mencionados.
+## Regras:
+1. Identifique entidades: pessoas, organizações, locais, eventos
+2. Para PESSOAS: inclua nome completo (se mencionado ou conhecido), cargo/função
+3. Para ORGANIZAÇÕES: inclua nome oficial e siglas
+4. Para LOCAIS: seja específico (cidade, país, endereço)
+5. Identifique RELACIONAMENTOS entre entidades (quem trabalha onde, quem conhece quem, etc.)
+6. Identifique EVENTOS mencionados (reuniões, anúncios, eleições, etc.)
+## Formato de resposta (JSON válido):
+```json
+{
+  "entities": [
+    {
+      "name": "Nome Completo",
+      "type": "person|organization|location|event",
+      "role": "cargo ou função (opcional)",
+      "aliases": ["apelidos", "siglas"],
+      "description": "breve descrição se relevante"
+    }
+  ],
+  "relationships": [
+    {
+      "source": "Nome da Entidade 1",
+      "target": "Nome da Entidade 2",
+      "relationship_type": "tipo de relação (trabalha em, preside, fundou, reuniu-se com, etc.)",
+      "context": "contexto da relação"
+    }
+  ],
+  "events": [
+    {
+      "description": "O que aconteceu",
+      "event_type": "meeting|announcement|election|crime|etc",
+      "date": "data se mencionada",
+      "location": "local se mencionado",
+      "participants": ["lista de participantes"]
+    }
+  ]
+}
+```
+Retorne APENAS o JSON, sem texto adicional.
+## Texto para análise:
+{text}
+"""
+class EntityExtractor:
+    """
+    LLM-based Entity Extractor using Cerebras API
+    """
+    def __init__(self):
+        self.api_key = settings.cerebras_api_key
+        self.base_url = "https://api.cerebras.ai/v1"
+        self.model = "qwen-3-235b-a22b-instruct-2507"
+        self.timeout = 60.0
+    async def extract(self, text: str) -> ExtractionResult:
+        """
+        Extract entities, relationships, and events from text using LLM
+        Args:
+            text: The text to analyze
+        Returns:
+            ExtractionResult with all extracted information
+        """
+        if not self.api_key:
+            raise ValueError("CEREBRAS_API_KEY not configured")
+        if not text or len(text.strip()) < 10:
+            return ExtractionResult(entities=[], relationships=[], events=[])
+        # Prepare the prompt
+        prompt = EXTRACTION_PROMPT.format(text=text)
+        # Call Cerebras API
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            response = await client.post(
+                f"{self.base_url}/chat/completions",
+                headers={
+                    "Authorization": f"Bearer {self.api_key}",
+                    "Content-Type": "application/json"
+                },
+                json={
+                    "model": self.model,
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "Você é um assistente especialista em extração de entidades e relacionamentos. Sempre responda em JSON válido."
+                        },
+                        {
+                            "role": "user",
+                            "content": prompt
+                        }
+                    ],
+                    "temperature": 0.1,  # Low temperature for consistent extraction
+                    "max_tokens": 4096
+                }
+            )
+            response.raise_for_status()
+            data = response.json()
+        # Parse the response
+        raw_content = data["choices"][0]["message"]["content"]
+        return self._parse_response(raw_content)
+    def _parse_response(self, content: str) -> ExtractionResult:
+        """Parse the LLM response into structured data"""
+        try:
+            # Try to extract JSON from the response
+            # Sometimes the model wraps it in ```json ... ```
+            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
+            if json_match:
+                json_str = json_match.group(1)
+            else:
+                # Try to find raw JSON
+                json_match = re.search(r'\{.*\}', content, re.DOTALL)
+                if json_match:
+                    json_str = json_match.group(0)
+                else:
+                    json_str = content
+            data = json.loads(json_str)
+            # Parse entities
+            entities = []
+            for e in data.get("entities", []):
+                entities.append(ExtractedEntity(
+                    name=e.get("name", ""),
+                    type=e.get("type", "unknown"),
+                    role=e.get("role"),
+                    aliases=e.get("aliases", []),
+                    description=e.get("description")
+                ))
+            # Parse relationships
+            relationships = []
+            for r in data.get("relationships", []):
+                relationships.append(ExtractedRelationship(
+                    source=r.get("source", ""),
+                    target=r.get("target", ""),
+                    relationship_type=r.get("relationship_type", "related_to"),
+                    context=r.get("context")
+                ))
+            # Parse events
+            events = []
+            for ev in data.get("events", []):
+                events.append(ExtractedEvent(
+                    description=ev.get("description", ""),
+                    event_type=ev.get("event_type"),
+                    date=ev.get("date"),
+                    location=ev.get("location"),
+                    participants=ev.get("participants", [])
+                ))
+            return ExtractionResult(
+                entities=entities,
+                relationships=relationships,
+                events=events,
+                raw_response=content
+            )
+        except json.JSONDecodeError as e:
+            print(f"Failed to parse LLM response: {e}")
+            print(f"Raw content: {content}")
+            return ExtractionResult(
+                entities=[],
+                relationships=[],
+                events=[],
+                raw_response=content
+            )
+    def extract_sync(self, text: str) -> ExtractionResult:
+        """
+        Synchronous version of extract for non-async contexts
+        """
+        import asyncio
+        return asyncio.run(self.extract(text))
+# Singleton instance, created at import time and re-exported by the app.services.nlp package
+entity_extractor = EntityExtractor()

requirements.txt CHANGED Viewed

@@ -9,3 +9,4 @@ httpx==0.25.2
 python-multipart==0.0.6
 aiohttp==3.9.1
 feedparser==6.0.10

 python-multipart==0.0.6
 aiohttp==3.9.1
 feedparser==6.0.10
+# httpx already included - used for Cerebras API calls