// agent01/src/crawler/models.ts
// Auto Deployer
// Deploy compliance agent services
// f39c319
/**
 * Numeric error codes used by the crawler models.
 *
 * Members fall into two numeric ranges: 4001-4004 and 5001-5004.
 * NOTE(review): judging by the member names, the 4xxx range looks like
 * input/content problems and the 5xxx range like pipeline-stage failures —
 * confirm against the code that raises them.
 *
 * Because this is a numeric enum, the compiled object also carries the
 * reverse mapping (e.g. `ErrorCode[4001] === 'SOURCE_MISSING'`).
 */
export enum ErrorCode {
  // --- 4xxx range ---
  SOURCE_MISSING = 4001,
  INVALID_URL = 4002,
  PARSE_FAILED = 4003,
  NO_CONTENT = 4004,

  // --- 5xxx range ---
  CRAWL_FAILED = 5001,
  DIFF_FAILED = 5002,
  DB_SAVE_FAILED = 5003,
  EXPORT_STRUCT_FAILED = 5004,
}
/** Category of a crawl source; one of three string literals. */
export type SourceType =
  | 'peer_bank'
  | 'regulator'
  | 'sdk_vendor';

/** Allowed crawl-frequency labels. How each label is scheduled is not defined in this file. */
export type CrawlFrequency =
  | '4h'
  | 'daily'
  | 'weekly';

/** Three-level priority label. */
export type Priority =
  | 'high'
  | 'medium'
  | 'low';
/**
 * Shape of a registered crawl source record.
 *
 * All fields are required except `created_at` and `updated_at`.
 */
export interface SourceRegistry {
  // Identity
  source_id: string;
  source_name: string;
  source_type: SourceType;

  // Location
  domain: string;
  entry_url: string;
  /** NOTE(review): pattern syntax (glob / regex / other) is not visible here — confirm. */
  url_pattern: string;

  // Crawl configuration
  /** Free-form string; the set of valid parser names is not declared in this file. */
  parser_type: string;
  crawl_frequency: CrawlFrequency;
  priority: Priority;
  enabled: boolean;
  topic_tags: string[];

  // Optional bookkeeping; carried as strings, format not specified in this file.
  created_at?: string;
  updated_at?: string;
}
/** What caused a crawl job to be created. */
export type TriggerType =
  | 'schedule'
  | 'manual'
  | 'webhook';

/** Status label of a crawl job. */
export type JobStatus =
  | 'queued'
  | 'running'
  | 'success'
  | 'failed';
/**
 * Shape of a single crawl job record for one source.
 *
 * `started_at`, `ended_at`, `error_code` and `error_message` are optional;
 * everything else is required.
 */
export interface CrawlJob {
  job_id: string;
  source_id: string;
  trigger_type: TriggerType;
  status: JobStatus;

  // Optional timing fields; carried as strings, format not specified in this file.
  started_at?: string;
  ended_at?: string;

  // Optional failure details.
  /**
   * NOTE(review): typed as `string`, whereas the `ErrorCode` enum in this
   * file is numeric — confirm which representation is actually stored.
   */
  error_code?: string;
  error_message?: string;

  retry_count: number;
}
/**
 * Shape of a raw fetch result, linked to a source and a job by id.
 *
 * Every field is required.
 */
export interface RawSnapshot {
  // Identity and linkage
  snapshot_id: string;
  source_id: string;
  job_id: string;

  /** Carried as a string; format not specified in this file. */
  fetched_at: string;

  // Fetched payload
  content_type: string;
  raw_body: string;
  /** NOTE(review): presumably a hash of `raw_body`; algorithm not visible here — confirm. */
  raw_hash: string;

  // HTTP outcome
  http_status: number;
  final_url: string;
}
/** Status label of a normalized document. */
export type DocStatus =
  | 'active'
  | 'archived';
/**
 * Shape of a normalized document, linked to a source and a snapshot by id.
 *
 * Only `created_at` is optional.
 */
export interface NormalizedDocument {
  // Identity and linkage
  doc_id: string;
  source_id: string;
  snapshot_id: string;

  title: string;

  // Date fields are carried as strings; format not specified in this file.
  version_date: string;
  effective_date: string;

  // Normalized content
  normalized_text: string;
  /** NOTE(review): presumably a hash of `normalized_text`; algorithm not visible here — confirm. */
  normalized_hash: string;

  doc_status: DocStatus;
  created_at?: string;
}
/** Embedding status label of a clause chunk. */
export type EmbeddingStatus =
  | 'pending'
  | 'ready'
  | 'failed';
/**
 * Shape of one clause-level chunk belonging to a document (`doc_id`).
 *
 * Only `created_at` is optional.
 */
export interface ClauseChunk {
  chunk_id: string;
  doc_id: string;

  // Position within the document
  /** NOTE(review): path format/separator is not visible here — confirm. */
  section_path: string;
  section_title: string;
  /** NOTE(review): presumably the chunk's position within its document; base (0/1) not visible — confirm. */
  chunk_order: number;

  // Content and classification
  clause_text: string;
  topic_tags: string[];
  embedding_status: EmbeddingStatus;

  created_at?: string;
}
/** Kind of change recorded by a diff event. */
export type ChangeType =
  | 'added'
  | 'removed'
  | 'modified'
  | 'unchanged';

/** Three-level impact label for a diff event. */
export type ImpactLevel =
  | 'high'
  | 'medium'
  | 'low';
/**
 * Shape of a change record between two documents of one source,
 * referenced by `from_doc_id` and `to_doc_id`.
 *
 * Every field is required.
 */
export interface DiffEvent {
  event_id: string;
  source_id: string;

  // The two documents being compared
  from_doc_id: string;
  to_doc_id: string;

  // What changed
  change_type: ChangeType;
  section_title: string;
  old_excerpt: string;
  new_excerpt: string;

  // Classification
  topic_tags: string[];
  impact_level: ImpactLevel;

  /** Carried as a string; format not specified in this file. */
  detected_at: string;
}
// API Requests / Responses
/**
 * Request payload for creating a source.
 *
 * Has the same fields as `SourceRegistry` except `source_id`, `created_at`
 * and `updated_at`; every field here is required.
 */
export interface CreateSourceRequest {
  // Identity
  source_name: string;
  source_type: SourceType;

  // Location
  domain: string;
  entry_url: string;
  url_pattern: string;

  // Crawl configuration
  parser_type: string;
  crawl_frequency: CrawlFrequency;
  priority: Priority;
  enabled: boolean;
  topic_tags: string[];
}
/** Response payload for a create-source request: a source id plus a success flag. */
export interface CreateSourceResponse {
  source_id: string;
  success: boolean;
}
/**
 * Query parameters for listing sources.
 *
 * Every field is optional.
 * NOTE(review): presumably an omitted field means "no filter on that
 * attribute" — confirm against the handler.
 */
export interface GetSourcesQuery {
  source_type?: SourceType;
  enabled?: boolean;
  priority?: Priority;
}
/**
 * Response payload for listing sources.
 *
 * NOTE(review): whether `total` always equals `items.length` or counts all
 * matches is not visible here — confirm.
 */
export interface GetSourcesResponse {
  items: SourceRegistry[];
  total: number;
}
/** Request payload for creating jobs: a list of source ids plus one trigger type. */
export interface CreateJobsRequest {
  source_ids: string[];
  trigger_type: TriggerType;
}
/**
 * Response payload for a create-jobs request.
 *
 * NOTE(review): `status` is a plain `string` here, not `JobStatus`; its
 * allowed values are not declared in this file — confirm.
 */
export interface CreateJobsResponse {
  job_ids: string[];
  status: string;
}
/**
 * Response payload for fetching a single job.
 *
 * `started_at`, `ended_at` and `error_code` are optional.
 */
export interface GetJobResponse {
  job_id: string;
  source_id: string;
  status: JobStatus;

  // Optional timing fields; carried as strings, format not specified in this file.
  started_at?: string;
  ended_at?: string;

  /**
   * May be omitted, a string, or an explicit `null`.
   * NOTE(review): `CrawlJob.error_code` in this file does not admit `null` —
   * confirm the difference is intentional.
   */
  error_code?: string | null;

  retry_count: number;
}