Madras1 committed on
Commit
8fea8c3
·
verified ·
1 Parent(s): 2747628

Upload 45 files

Browse files
app/api/routes/analyze.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analyze API Routes - LLM-based text analysis
3
+ """
4
+ from fastapi import APIRouter, HTTPException
5
+ from pydantic import BaseModel, Field
6
+ from typing import Optional, List
7
+ from sqlalchemy.orm import Session
8
+
9
+ from app.core.database import get_db
10
+ from app.services.nlp import entity_extractor
11
+ from app.models.entity import Entity, Relationship, Event
12
+ from app.schemas.entity import EntityCreate, EntityResponse
13
+
14
+
15
# Router grouping the text-analysis endpoints under the "/analyze" prefix.
router = APIRouter(
    prefix="/analyze",
    tags=["Analysis"],
)
16
+
17
+
18
class AnalyzeRequest(BaseModel):
    """Payload accepted by the analysis endpoints."""

    # Input text; validation rejects anything shorter than 10 characters.
    text: str = Field(
        ...,
        min_length=10,
        description="Text to analyze",
    )
    # Opt-in flag: when true, extracted items are also written to the database.
    auto_create: bool = Field(
        default=False,
        description="Auto-create extracted entities in database",
    )
22
+
23
+
24
class ExtractedEntityResponse(BaseModel):
    """One entity found in the analyzed text, plus its persistence status."""

    name: str
    type: str
    role: Optional[str] = None
    aliases: Optional[List[str]] = None
    description: Optional[str] = None
    # True only when this request inserted a new row for the entity.
    created: bool = False
    # Database ID of the new or already-existing row; None when not persisted.
    entity_id: Optional[str] = None
33
+
34
+
35
class ExtractedRelationshipResponse(BaseModel):
    """One relationship between two extracted entities, referenced by name."""

    source: str
    target: str
    relationship_type: str
    context: Optional[str] = None
    # True only when this request inserted a new relationship row.
    created: bool = False
42
+
43
+
44
class ExtractedEventResponse(BaseModel):
    """One event found in the analyzed text, plus its persistence status."""

    description: str
    event_type: Optional[str] = None
    date: Optional[str] = None
    location: Optional[str] = None
    participants: Optional[List[str]] = None
    # True only when this request inserted a new event row.
    created: bool = False
    # Database ID of the inserted event; None when nothing was persisted.
    event_id: Optional[str] = None
53
+
54
+
55
class AnalyzeResponse(BaseModel):
    """Full result of an analysis call: extracted items and summary counters."""

    entities: List[ExtractedEntityResponse]
    relationships: List[ExtractedRelationshipResponse]
    events: List[ExtractedEventResponse]
    # Counters for totals and for rows created during the request.
    stats: dict
61
+
62
+
63
def _find_entity_by_name(db, name):
    """Return the first Entity whose name contains *name* (case-insensitive).

    Returns None for an empty name: the pattern would otherwise degenerate to
    "%%", which matches every row and would link unrelated entities.
    """
    if not name:
        return None
    # NOTE(review): this is a substring match and LIKE wildcards in *name*
    # are not escaped, so "Ana" also matches "Banana". Confirm that this
    # fuzzy de-duplication is intended.
    return db.query(Entity).filter(Entity.name.ilike(f"%{name}%")).first()


@router.post("", response_model=AnalyzeResponse)
async def analyze_text(request: AnalyzeRequest):
    """
    Analyze text using LLM to extract entities, relationships, and events.

    Uses Cerebras API with Qwen 3 235B for intelligent extraction.

    Args:
        text: Text to analyze (min 10 characters)
        auto_create: If true, automatically creates entities, relationships
            and events in the database (existing matches are reused)

    Returns:
        Extracted entities, relationships, events, and statistics

    Raises:
        HTTPException: 500 when extraction or persistence fails
    """
    allowed_types = ("person", "organization", "location", "event")
    db_gen = None
    db = None

    try:
        # Extract using LLM
        result = await entity_extractor.extract(request.text)

        entities_response = []
        relationships_response = []
        events_response = []

        created_entities = 0
        created_relationships = 0
        created_events = 0

        if request.auto_create:
            # Open a session only when something will be persisted. Keep the
            # generator so its cleanup can be triggered in ``finally``.
            db_gen = get_db()
            db = next(db_gen)

        # Process entities
        for entity in result.entities:
            entity_data = ExtractedEntityResponse(
                name=entity.name,
                type=entity.type,
                role=entity.role,
                aliases=entity.aliases,
                description=entity.description,
                created=False
            )

            if request.auto_create and entity.name:
                existing = _find_entity_by_name(db, entity.name)

                if not existing:
                    new_entity = Entity(
                        name=entity.name,
                        # Unknown types fall back to "person".
                        type=entity.type if entity.type in allowed_types else "person",
                        description=entity.description or entity.role or "",
                        source="llm_extraction",
                        properties={"role": entity.role, "aliases": entity.aliases}
                    )
                    db.add(new_entity)
                    db.commit()
                    db.refresh(new_entity)

                    entity_data.created = True
                    entity_data.entity_id = new_entity.id
                    created_entities += 1
                else:
                    entity_data.entity_id = existing.id

            entities_response.append(entity_data)

        # Process relationships
        for rel in result.relationships:
            rel_data = ExtractedRelationshipResponse(
                source=rel.source,
                target=rel.target,
                relationship_type=rel.relationship_type,
                context=rel.context,
                created=False
            )

            # Both endpoints must be named; an empty name must never be
            # resolved to "whatever entity comes first".
            if request.auto_create and rel.source and rel.target:
                source_entity = _find_entity_by_name(db, rel.source)
                target_entity = _find_entity_by_name(db, rel.target)

                if source_entity and target_entity:
                    existing_rel = db.query(Relationship).filter(
                        Relationship.source_id == source_entity.id,
                        Relationship.target_id == target_entity.id,
                        Relationship.relationship_type == rel.relationship_type
                    ).first()

                    if not existing_rel:
                        new_rel = Relationship(
                            source_id=source_entity.id,
                            target_id=target_entity.id,
                            relationship_type=rel.relationship_type,
                            description=rel.context
                        )
                        db.add(new_rel)
                        db.commit()
                        rel_data.created = True
                        created_relationships += 1

            relationships_response.append(rel_data)

        # Process events
        for event in result.events:
            event_data = ExtractedEventResponse(
                description=event.description,
                event_type=event.event_type,
                date=event.date,
                location=event.location,
                participants=event.participants,
                created=False
            )

            if request.auto_create and event.description:
                new_event = Event(
                    # The title is the description capped at 100 characters.
                    title=event.description[:100],
                    description=event.description,
                    event_type=event.event_type or "general",
                    source="llm_extraction"
                )
                db.add(new_event)
                db.commit()
                db.refresh(new_event)

                event_data.created = True
                event_data.event_id = new_event.id
                created_events += 1

            events_response.append(event_data)

        return AnalyzeResponse(
            entities=entities_response,
            relationships=relationships_response,
            events=events_response,
            stats={
                "total_entities": len(entities_response),
                "total_relationships": len(relationships_response),
                "total_events": len(events_response),
                "created_entities": created_entities,
                "created_relationships": created_relationships,
                "created_events": created_events
            }
        )

    except HTTPException:
        # Do not re-wrap HTTP errors that already carry a status code.
        raise
    except Exception as e:
        if db is not None:
            # Discard any half-finished transaction before reporting the error.
            db.rollback()
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") from e
    finally:
        if db_gen is not None:
            # Closing the generator runs get_db's own cleanup code
            # (presumably closing the session - verify against get_db).
            db_gen.close()
216
+
217
+
218
@router.post("/quick")
async def quick_analyze(request: AnalyzeRequest):
    """
    Run the extractor and return its output as plain dictionaries.

    Nothing is read from or written to the database; ``auto_create`` is
    ignored by this endpoint.
    """
    try:
        extraction = await entity_extractor.extract(request.text)

        entity_items = []
        for found in extraction.entities:
            entity_items.append({
                "name": found.name,
                "type": found.type,
                "role": found.role,
                "aliases": found.aliases
            })

        relationship_items = []
        for link in extraction.relationships:
            relationship_items.append({
                "source": link.source,
                "target": link.target,
                "type": link.relationship_type,
                "context": link.context
            })

        event_items = []
        for happening in extraction.events:
            event_items.append({
                "description": happening.description,
                "type": happening.event_type,
                "date": happening.date,
                "participants": happening.participants
            })

        return {
            "entities": entity_items,
            "relationships": relationship_items,
            "events": event_items
        }
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(exc)}")
app/config.py CHANGED
@@ -20,6 +20,9 @@ class Settings(BaseSettings):
20
  # APIs (opcional - pode configurar depois)
21
  newsapi_key: str = ""
22
 
 
 
 
23
  # CORS
24
  cors_origins: list[str] = ["*"]
25
 
 
20
  # APIs (opcional - pode configurar depois)
21
  newsapi_key: str = ""
22
 
23
+ # Cerebras API for LLM-based entity extraction
24
+ cerebras_api_key: str = ""
25
+
26
  # CORS
27
  cors_origins: list[str] = ["*"]
28
 
app/main.py CHANGED
@@ -8,7 +8,7 @@ from contextlib import asynccontextmanager
8
 
9
  from app.config import settings
10
  from app.core.database import init_db
11
- from app.api.routes import entities, relationships, events, search, ingest
12
 
13
 
14
  @asynccontextmanager
@@ -55,6 +55,7 @@ app.include_router(relationships.router, prefix="/api/v1")
55
  app.include_router(events.router, prefix="/api/v1")
56
  app.include_router(search.router, prefix="/api/v1")
57
  app.include_router(ingest.router, prefix="/api/v1")
 
58
 
59
 
60
  @app.get("/")
 
8
 
9
  from app.config import settings
10
  from app.core.database import init_db
11
+ from app.api.routes import entities, relationships, events, search, ingest, analyze
12
 
13
 
14
  @asynccontextmanager
 
55
  app.include_router(events.router, prefix="/api/v1")
56
  app.include_router(search.router, prefix="/api/v1")
57
  app.include_router(ingest.router, prefix="/api/v1")
58
+ app.include_router(analyze.router, prefix="/api/v1")
59
 
60
 
61
  @app.get("/")
app/services/nlp/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # NLP Services
2
+ from .entity_extractor import entity_extractor
app/services/nlp/entity_extractor.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Entity Extractor Service - LLM-based NER
3
+ Uses Cerebras API with Qwen 3 235B for intelligent entity and relationship extraction
4
+ """
5
+ import json
6
+ import re
7
+ from typing import Dict, List, Optional, Any
8
+ from dataclasses import dataclass
9
+ import httpx
10
+
11
+ from app.config import settings
12
+
13
+
14
@dataclass
class ExtractedEntity:
    """A single entity reported by the extractor."""

    name: str
    # One of: person, organization, location, event
    type: str
    role: Optional[str] = None
    aliases: Optional[List[str]] = None
    description: Optional[str] = None
    # Optional coordinates; the response parser in this module never sets them.
    latitude: Optional[float] = None
    longitude: Optional[float] = None
24
+
25
+
26
@dataclass
class ExtractedRelationship:
    """A directed link between two entities, identified by their names."""

    source: str
    target: str
    relationship_type: str
    # Free-text context for the relationship, when provided.
    context: Optional[str] = None
33
+
34
+
35
@dataclass
class ExtractedEvent:
    """A single event reported by the extractor."""

    description: str
    event_type: Optional[str] = None
    # Kept as free text, exactly as the extractor returned it.
    date: Optional[str] = None
    location: Optional[str] = None
    participants: Optional[List[str]] = None
43
+
44
+
45
@dataclass
class ExtractionResult:
    """Everything extracted from one text: entities, relationships, events."""

    entities: List[ExtractedEntity]
    relationships: List[ExtractedRelationship]
    events: List[ExtractedEvent]
    # Unparsed model output, when the result came from parsing a response.
    raw_response: Optional[str] = None
52
+
53
+
54
+ EXTRACTION_PROMPT = """Você é um especialista em extração de informações estruturadas de textos.
55
+
56
+ Analise o texto fornecido e extraia TODAS as entidades, relacionamentos e eventos mencionados.
57
+
58
+ ## Regras:
59
+ 1. Identifique entidades: pessoas, organizações, locais, eventos
60
+ 2. Para PESSOAS: inclua nome completo (se mencionado ou conhecido), cargo/função
61
+ 3. Para ORGANIZAÇÕES: inclua nome oficial e siglas
62
+ 4. Para LOCAIS: seja específico (cidade, país, endereço)
63
+ 5. Identifique RELACIONAMENTOS entre entidades (quem trabalha onde, quem conhece quem, etc.)
64
+ 6. Identifique EVENTOS mencionados (reuniões, anúncios, eleições, etc.)
65
+
66
+ ## Formato de resposta (JSON válido):
67
+ ```json
68
+ {
69
+ "entities": [
70
+ {
71
+ "name": "Nome Completo",
72
+ "type": "person|organization|location|event",
73
+ "role": "cargo ou função (opcional)",
74
+ "aliases": ["apelidos", "siglas"],
75
+ "description": "breve descrição se relevante"
76
+ }
77
+ ],
78
+ "relationships": [
79
+ {
80
+ "source": "Nome da Entidade 1",
81
+ "target": "Nome da Entidade 2",
82
+ "relationship_type": "tipo de relação (trabalha em, preside, fundou, reuniu-se com, etc.)",
83
+ "context": "contexto da relação"
84
+ }
85
+ ],
86
+ "events": [
87
+ {
88
+ "description": "O que aconteceu",
89
+ "event_type": "meeting|announcement|election|crime|etc",
90
+ "date": "data se mencionada",
91
+ "location": "local se mencionado",
92
+ "participants": ["lista de participantes"]
93
+ }
94
+ ]
95
+ }
96
+ ```
97
+
98
+ Retorne APENAS o JSON, sem texto adicional.
99
+
100
+ ## Texto para análise:
101
+ {text}
102
+ """
103
+
104
+
105
class EntityExtractor:
    """
    LLM-based Entity Extractor using Cerebras API
    """

    def __init__(self):
        # The key is read once at construction time from application settings.
        self.api_key = settings.cerebras_api_key
        self.base_url = "https://api.cerebras.ai/v1"
        self.model = "qwen-3-235b-a22b-instruct-2507"
        # Timeout passed to the HTTP client for the completion request.
        self.timeout = 60.0

    async def extract(self, text: str) -> ExtractionResult:
        """
        Extract entities, relationships, and events from text using LLM

        Args:
            text: The text to analyze

        Returns:
            ExtractionResult with all extracted information. The result is
            empty when the text is blank or shorter than 10 characters after
            stripping, or when the model output cannot be parsed.

        Raises:
            ValueError: If no API key is configured
            httpx.HTTPStatusError: If the API answers with an error status
        """
        if not self.api_key:
            raise ValueError("CEREBRAS_API_KEY not configured")

        if not text or len(text.strip()) < 10:
            return ExtractionResult(entities=[], relationships=[], events=[])

        # The template contains literal JSON braces, so str.format() would
        # treat them as replacement fields and raise. Substitute only the
        # {text} placeholder.
        prompt = EXTRACTION_PROMPT.replace("{text}", text)

        # Call Cerebras API
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.post(
                f"{self.base_url}/chat/completions",
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": self.model,
                    "messages": [
                        {
                            "role": "system",
                            "content": "Você é um assistente especialista em extração de entidades e relacionamentos. Sempre responda em JSON válido."
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    "temperature": 0.1,  # Low temperature for consistent extraction
                    "max_tokens": 4096
                }
            )

            response.raise_for_status()
            data = response.json()

        # A missing/null message content is treated as an empty answer.
        raw_content = data["choices"][0]["message"]["content"]
        return self._parse_response(raw_content or "")

    @staticmethod
    def _empty_result(raw: Optional[str]) -> ExtractionResult:
        """Build an ExtractionResult with no items that keeps the raw text."""
        return ExtractionResult(
            entities=[],
            relationships=[],
            events=[],
            raw_response=raw
        )

    def _parse_response(self, content: str) -> ExtractionResult:
        """
        Parse the LLM response into structured data

        Accepts JSON wrapped in a ```json fence, JSON embedded in surrounding
        text, or bare JSON. Output that is not valid JSON, or whose root is
        not an object, yields an empty result. List items that are not
        objects are skipped, and explicit nulls fall back to the same
        defaults as missing keys.
        """
        try:
            # Sometimes the model wraps the JSON in ```json ... ```
            fenced = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if fenced:
                json_str = fenced.group(1)
            else:
                # Otherwise take the outermost {...} span, if any
                braces = re.search(r'\{.*\}', content, re.DOTALL)
                json_str = braces.group(0) if braces else content

            data = json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Failed to parse LLM response: {e}")
            print(f"Raw content: {content}")
            return self._empty_result(content)

        if not isinstance(data, dict):
            # Valid JSON but not the expected object (e.g. a list or string).
            return self._empty_result(content)

        def items(key: str) -> List[Dict[str, Any]]:
            """Return the dict items stored under *key*, ignoring the rest."""
            value = data.get(key)
            if not isinstance(value, list):
                return []
            return [item for item in value if isinstance(item, dict)]

        entities = [
            ExtractedEntity(
                name=e.get("name") or "",
                type=e.get("type") or "unknown",
                role=e.get("role"),
                aliases=e.get("aliases") or [],
                description=e.get("description")
            )
            for e in items("entities")
        ]

        relationships = [
            ExtractedRelationship(
                source=r.get("source") or "",
                target=r.get("target") or "",
                relationship_type=r.get("relationship_type") or "related_to",
                context=r.get("context")
            )
            for r in items("relationships")
        ]

        events = [
            ExtractedEvent(
                description=ev.get("description") or "",
                event_type=ev.get("event_type"),
                date=ev.get("date"),
                location=ev.get("location"),
                participants=ev.get("participants") or []
            )
            for ev in items("events")
        ]

        return ExtractionResult(
            entities=entities,
            relationships=relationships,
            events=events,
            raw_response=content
        )

    def extract_sync(self, text: str) -> ExtractionResult:
        """
        Synchronous version of extract for non-async contexts

        Must not be called from a thread that already has a running event
        loop: asyncio.run() raises RuntimeError in that case.
        """
        import asyncio
        return asyncio.run(self.extract(text))
240
+
241
+
242
# Module-level shared instance; the package __init__ re-exports this name.
entity_extractor = EntityExtractor()
requirements.txt CHANGED
@@ -9,3 +9,4 @@ httpx==0.25.2
9
  python-multipart==0.0.6
10
  aiohttp==3.9.1
11
  feedparser==6.0.10
 
 
9
  python-multipart==0.0.6
10
  aiohttp==3.9.1
11
  feedparser==6.0.10
12
+ # httpx already included - used for Cerebras API calls