| """ | |
| SQLAlchemy ORM models for CopperMind. | |
| Tables: | |
| - NewsArticle: Raw news articles with dedup | |
| - PriceBar: OHLCV price data per symbol/date | |
| - NewsSentiment: FinBERT scores per article | |
| - DailySentiment: Aggregated daily sentiment index | |
| - AnalysisSnapshot: Cached analysis reports | |
| """ | |
| from datetime import datetime, timezone | |
| from typing import Optional | |
| def _utcnow() -> datetime: | |
| """Timezone-aware UTC now, replacing deprecated datetime.utcnow().""" | |
| return datetime.now(timezone.utc) | |
| from sqlalchemy import ( | |
| Column, | |
| Integer, | |
| BigInteger, | |
| String, | |
| Float, | |
| DateTime, | |
| Text, | |
| Boolean, | |
| ForeignKey, | |
| Index, | |
| LargeBinary, | |
| UniqueConstraint, | |
| JSON, | |
| func, | |
| ) | |
| from sqlalchemy.dialects.postgresql import UUID, JSONB | |
| from sqlalchemy.orm import relationship | |
| from app.db import Base | |
class NewsArticle(Base):
    """
    Raw news articles collected from various sources.

    The unique ``dedup_key`` prevents duplicate articles at insert time.
    One-to-one with :class:`NewsSentiment` via the ``sentiment`` relationship.
    """
    __tablename__ = "news_articles"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Deduplication key (64 chars — presumably a sha256 hex digest; confirm
    # against the ingestion code that computes it)
    dedup_key = Column(String(64), unique=True, nullable=False, index=True)

    # Content
    title = Column(String(500), nullable=False)
    canonical_title = Column(String(500), nullable=True, index=True)  # For fuzzy dedup
    description = Column(Text, nullable=True)
    content = Column(Text, nullable=True)
    url = Column(String(2000), nullable=True)

    # Metadata
    source = Column(String(200), nullable=True)
    author = Column(String(200), nullable=True)
    language = Column(String(10), nullable=True, default="en")

    # Timestamps (timezone-aware; fetched_at defaults to UTC now on insert)
    published_at = Column(DateTime(timezone=True), nullable=False, index=True)
    fetched_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow)

    # One-to-one link to this article's sentiment score (uselist=False)
    sentiment = relationship("NewsSentiment", back_populates="article", uselist=False)

    def __repr__(self):
        return f"<NewsArticle(id={self.id}, title='{self.title[:30]}...')>"
class PriceBar(Base):
    """
    Daily OHLCV price data for tracked symbols.

    The unique constraint on (symbol, date) prevents duplicate bars; the
    composite index supports symbol + date-range queries.
    """
    __tablename__ = "price_bars"

    id = Column(Integer, primary_key=True, autoincrement=True)

    symbol = Column(String(20), nullable=False, index=True)
    date = Column(DateTime(timezone=True), nullable=False, index=True)

    # OHLCV — only close is required; open/high/low/volume may be missing
    # for some sources
    open = Column(Float, nullable=True)
    high = Column(Float, nullable=True)
    low = Column(Float, nullable=True)
    close = Column(Float, nullable=False)
    volume = Column(Float, nullable=True)

    # Adjusted close (for splits/dividends)
    adj_close = Column(Float, nullable=True)

    # When this record was fetched
    fetched_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow)

    __table_args__ = (
        UniqueConstraint("symbol", "date", name="uq_price_symbol_date"),
        Index("ix_price_symbol_date", "symbol", "date"),
    )

    def __repr__(self):
        return f"<PriceBar(symbol={self.symbol}, date={self.date}, close={self.close})>"
class NewsSentiment(Base):
    """
    Sentiment scores for each news article.

    Primary: LLM (OpenRouter structured outputs) with copper-specific context.
    Fallback: FinBERT for generic financial sentiment.
    One-to-one relationship with NewsArticle (enforced by the unique FK).
    """
    __tablename__ = "news_sentiments"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Unique FK => at most one sentiment row per article; deleting the
    # article cascades to its sentiment row.
    news_article_id = Column(
        Integer,
        ForeignKey("news_articles.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
        index=True
    )

    # Sentiment probabilities (LLM derives these from score)
    prob_positive = Column(Float, nullable=False)
    prob_neutral = Column(Float, nullable=False)
    prob_negative = Column(Float, nullable=False)

    # Sentiment score: -1 (bearish) to +1 (bullish)
    score = Column(Float, nullable=False, index=True)

    # LLM reasoning for the score (debug + future UI display)
    reasoning = Column(Text, nullable=True)

    # Model info (LLM model id, or "ProsusAI/finbert" for the fallback path)
    model_name = Column(String(100), default="google/gemini-2.0-flash-exp:free")

    # When scored (Python-side UTC default)
    scored_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow)

    # Back-reference to the scored article
    article = relationship("NewsArticle", back_populates="sentiment")

    def __repr__(self):
        return f"<NewsSentiment(article_id={self.news_article_id}, score={self.score:.3f})>"
class DailySentiment(Base):
    """
    Aggregated daily sentiment index.

    One row per date (enforced by the unique constraint on ``date``) holding
    the weighted average sentiment over that day's articles.
    """
    __tablename__ = "daily_sentiments"

    id = Column(Integer, primary_key=True, autoincrement=True)

    date = Column(DateTime(timezone=True), nullable=False, unique=True, index=True)

    # Aggregated sentiment index for the day
    sentiment_index = Column(Float, nullable=False)

    # Statistics over the day's scored articles
    news_count = Column(Integer, nullable=False, default=0)
    avg_positive = Column(Float, nullable=True)
    avg_neutral = Column(Float, nullable=True)
    avg_negative = Column(Float, nullable=True)

    # Weighting method used for aggregation
    weighting_method = Column(String(50), default="recency_exponential")

    # When aggregated
    aggregated_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow)

    def __repr__(self):
        return f"<DailySentiment(date={self.date}, index={self.sentiment_index:.3f}, news={self.news_count})>"
class AnalysisSnapshot(Base):
    """
    Cached analysis reports for API responses.

    Enables TTL-based caching and stable responses during pipeline runs.
    One snapshot per (symbol, as_of_date).
    """
    __tablename__ = "analysis_snapshots"

    id = Column(Integer, primary_key=True, autoincrement=True)

    symbol = Column(String(20), nullable=False, index=True)
    as_of_date = Column(DateTime(timezone=True), nullable=False)

    # Full analysis report as JSON (generic JSON type, portable across DBs)
    report_json = Column(JSON, nullable=False)

    # When this snapshot was generated (indexed for "latest snapshot" queries)
    generated_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow, index=True)

    # Model version used to produce the report
    model_version = Column(String(100), nullable=True)

    __table_args__ = (
        UniqueConstraint("symbol", "as_of_date", name="uq_snapshot_symbol_date"),
        Index("ix_snapshot_symbol_generated", "symbol", "generated_at"),
    )

    def __repr__(self):
        return f"<AnalysisSnapshot(symbol={self.symbol}, as_of={self.as_of_date})>"
class AICommentary(Base):
    """
    Cached AI commentary generated after pipeline runs.

    One row per symbol (unique constraint), overwritten after each pipeline
    execution rather than appended.
    """
    __tablename__ = "ai_commentaries"

    id = Column(Integer, primary_key=True, autoincrement=True)

    symbol = Column(String(20), nullable=False, unique=True, index=True)

    # The generated commentary text
    commentary = Column(Text, nullable=False)

    # Input data used to generate it (kept for debugging/traceability)
    current_price = Column(Float, nullable=True)
    predicted_price = Column(Float, nullable=True)
    predicted_return = Column(Float, nullable=True)
    sentiment_label = Column(String(20), nullable=True)

    # AI-determined market stance (BULLISH/NEUTRAL/BEARISH),
    # generated by having the LLM analyze its own commentary
    ai_stance = Column(String(20), nullable=True, default="NEUTRAL")

    # When generated
    generated_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow, index=True)

    # Model used
    model_name = Column(String(100), nullable=True)

    def __repr__(self):
        return f"<AICommentary(symbol={self.symbol}, generated_at={self.generated_at})>"
class ModelMetadata(Base):
    """
    Persisted XGBoost model metadata.

    Stores feature importance, the feature list, and training metrics in the
    database so they survive HF Space restarts. One row per symbol, updated
    after each model training (train_model=True).
    """
    __tablename__ = "model_metadata"

    id = Column(Integer, primary_key=True, autoincrement=True)

    symbol = Column(String(20), nullable=False, unique=True, index=True)

    # Feature importance as JSON: [{feature, importance}, ...]
    importance_json = Column(Text, nullable=True)

    # Feature names list as JSON: ["feature1", "feature2", ...]
    features_json = Column(Text, nullable=True)

    # Training metrics as JSON: {train_mae, val_mae, ...}
    metrics_json = Column(Text, nullable=True)

    # When the model was trained
    trained_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow, index=True)

    def __repr__(self):
        return f"<ModelMetadata(symbol={self.symbol}, trained_at={self.trained_at})>"
class PipelineRunMetrics(Base):
    """
    Metrics captured after each pipeline run for monitoring.

    Enables tracking of:
    - Symbol fetch success/failure rates
    - Model training metrics over time
    - Pipeline duration trends
    - Data quality indicators

    Nearly every metric column is nullable: a row is created when the run
    starts (status="running") and filled in as stages complete.
    """
    __tablename__ = "pipeline_run_metrics"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Run identification
    run_id = Column(String(64), nullable=False, unique=True, index=True)
    run_started_at = Column(DateTime(timezone=True), nullable=False, index=True)
    run_completed_at = Column(DateTime(timezone=True), nullable=True)

    # Duration
    duration_seconds = Column(Float, nullable=True)

    # Symbol set info
    symbol_set_name = Column(String(50), nullable=True)  # active/champion/challenger
    symbols_requested = Column(Integer, nullable=True)
    symbols_fetched_ok = Column(Integer, nullable=True)
    symbols_failed = Column(Integer, nullable=True)
    failed_symbols_list = Column(Text, nullable=True)  # JSON array

    # Training metrics (XGBoost)
    train_mae = Column(Float, nullable=True)
    val_mae = Column(Float, nullable=True)
    train_rmse = Column(Float, nullable=True)
    val_rmse = Column(Float, nullable=True)
    feature_count = Column(Integer, nullable=True)
    train_samples = Column(Integer, nullable=True)
    val_samples = Column(Integer, nullable=True)

    # Data quality (legacy - news_articles table)
    news_imported = Column(Integer, nullable=True)
    news_duplicates = Column(Integer, nullable=True)
    price_bars_updated = Column(Integer, nullable=True)
    missing_price_days = Column(Integer, nullable=True)

    # Phase 2: Reproducible news pipeline stats (news_raw/news_processed tables)
    news_raw_inserted = Column(Integer, nullable=True)
    news_raw_duplicates = Column(Integer, nullable=True)
    news_processed_inserted = Column(Integer, nullable=True)
    news_processed_duplicates = Column(Integer, nullable=True)
    articles_scored_v2 = Column(Integer, nullable=True)
    llm_parse_fail_count = Column(Integer, nullable=True)
    escalation_count = Column(Integer, nullable=True)
    fallback_count = Column(Integer, nullable=True)

    # Snapshot info
    snapshot_generated = Column(Boolean, default=False)
    commentary_generated = Column(Boolean, default=False)

    # TFT-ASRO deep learning pipeline stats
    tft_embeddings_computed = Column(Integer, nullable=True)
    tft_trained = Column(Boolean, default=False)
    tft_val_loss = Column(Float, nullable=True)
    tft_sharpe = Column(Float, nullable=True)
    tft_directional_accuracy = Column(Float, nullable=True)
    tft_snapshot_generated = Column(Boolean, default=False)

    # Phase 2: News cut-off time used for this run
    news_cutoff_time = Column(DateTime(timezone=True), nullable=True)

    # Quality state for degraded runs
    quality_state = Column(String(20), nullable=True, default="ok")  # ok/stale/degraded/failed

    # Status
    status = Column(String(20), nullable=False, default="running")  # running/success/failed
    error_message = Column(Text, nullable=True)

    def __repr__(self):
        return f"<PipelineRunMetrics(run_id={self.run_id}, status={self.status})>"
| # ============================================================================= | |
| # Faz 2: Reproducible News Pipeline | |
| # ============================================================================= | |
class NewsRaw(Base):
    """
    Raw news data - stored exactly as received from RSS/API.

    Phase 2: the "golden source" for reproducibility.

    Dedup strategy:
    - url_hash: nullable + partial unique index (WHERE url_hash IS NOT NULL)
    - If the URL is missing, title-based fallback dedup happens at the
      processed level (see NewsProcessed.dedup_key)
    """
    __tablename__ = "news_raw"

    id = Column(BigInteger, primary_key=True, autoincrement=True)

    # URL (nullable - may be missing in RSS feeds)
    url = Column(String(2000), nullable=True)
    url_hash = Column(String(64), nullable=True, index=True)  # sha256, partial unique

    # Content
    title = Column(String(500), nullable=False)
    description = Column(Text, nullable=True)

    # Metadata
    source = Column(String(200), nullable=True)  # "google_news", "newsapi"
    source_feed = Column(String(500), nullable=True)  # Exact RSS URL or query
    published_at = Column(DateTime(timezone=True), nullable=False, index=True)
    # Server-side default (func.now()), unlike the Python-side _utcnow used
    # by the legacy tables
    fetched_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)

    # Pipeline run tracking (UUID; PostgreSQL-specific column type)
    run_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    # Raw payload for debug/audit (JSONB — PostgreSQL-specific)
    raw_payload = Column(JSONB, nullable=True)

    # One raw row may feed multiple processed rows
    processed_items = relationship("NewsProcessed", back_populates="raw")

    def __repr__(self):
        return f"<NewsRaw(id={self.id}, title='{self.title[:30]}...')>"
class NewsProcessed(Base):
    """
    Processed news - after dedup, cleaning, and language filtering.

    Phase 2: input for sentiment scoring.

    Dedup strategy:
    - dedup_key: NOT NULL + UNIQUE - the authoritative dedup key
    - Priority: use url_hash when available, otherwise
      sha256(source + canonical_title_hash)
    """
    __tablename__ = "news_processed"

    id = Column(BigInteger, primary_key=True, autoincrement=True)

    # FK to raw (RESTRICT - deleting a raw row must not delete its
    # processed rows; raw is the golden source)
    raw_id = Column(
        BigInteger,
        ForeignKey("news_raw.id", ondelete="RESTRICT"),
        nullable=False,
        index=True
    )

    # Canonical content
    canonical_title = Column(String(500), nullable=False)
    canonical_title_hash = Column(String(64), nullable=False, index=True)  # sha256
    cleaned_text = Column(Text, nullable=True)  # title + description, cleaned

    # Dedup key - THE AUTHORITATIVE dedup constraint for this table
    dedup_key = Column(String(64), unique=True, nullable=False, index=True)  # sha256

    # Language detection result
    language = Column(String(10), nullable=True, default="en")
    language_confidence = Column(Float, nullable=True)

    # Processing metadata (server-side timestamp default)
    processed_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
    run_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    # Future: Tone/Impact scores (Phase 3)
    # tone_score = Column(Float, nullable=True)
    # impact_direction = Column(String(20), nullable=True)  # bullish/bearish/neutral

    # Relationships: back to the raw source and forward to v2 sentiment rows
    raw = relationship("NewsRaw", back_populates="processed_items")
    sentiment_v2_items = relationship("NewsSentimentV2", back_populates="processed")

    def __repr__(self):
        return f"<NewsProcessed(id={self.id}, dedup_key='{self.dedup_key[:16]}...')>"
class NewsSentimentV2(Base):
    """
    Commodity-aware sentiment scores generated from news_processed records.

    Multiple rows per processed article are possible, one per horizon
    (unique on (news_processed_id, horizon_days)).
    """
    __tablename__ = "news_sentiments_v2"

    id = Column(BigInteger, primary_key=True, autoincrement=True)

    news_processed_id = Column(
        BigInteger,
        ForeignKey("news_processed.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Prediction horizon in days for this score
    horizon_days = Column(Integer, nullable=False, default=5)

    # Discrete sentiment label — presumably bullish/bearish/neutral;
    # TODO confirm against the scoring code
    label = Column(String(20), nullable=False, index=True)

    # LLM outputs: raw impact score and its confidence, plus a calibrated
    # confidence (calibration method not visible here — confirm upstream)
    impact_score_llm = Column(Float, nullable=False)
    confidence_llm = Column(Float, nullable=False)
    confidence_calibrated = Column(Float, nullable=False, index=True)

    # Copper-relevance of the article and classified event type
    relevance_score = Column(Float, nullable=False, index=True)
    event_type = Column(String(50), nullable=False, index=True)

    # Rule-derived sign — presumably -1/0/+1 direction override; verify
    rule_sign = Column(Integer, nullable=False)

    # Final combined sentiment score used downstream
    final_score = Column(Float, nullable=False, index=True)

    # FinBERT class probabilities (fallback/ensemble inputs)
    finbert_pos = Column(Float, nullable=False)
    finbert_neu = Column(Float, nullable=False)
    finbert_neg = Column(Float, nullable=False)

    # LLM reasoning serialized as JSON text (debug/audit)
    reasoning_json = Column(Text, nullable=True)

    # Two-tier model setup: fast model plus reliable escalation model
    model_fast = Column(String(100), nullable=True)
    model_reliable = Column(String(100), nullable=True)

    scored_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow, index=True)

    processed = relationship("NewsProcessed", back_populates="sentiment_v2_items")

    __table_args__ = (
        UniqueConstraint("news_processed_id", "horizon_days", name="uq_news_sentiments_v2_processed_horizon"),
        Index("ix_news_sentiments_v2_processed_scored", "news_processed_id", "scored_at"),
    )

    def __repr__(self):
        return (
            "<NewsSentimentV2(processed_id="
            f"{self.news_processed_id}, horizon_days={self.horizon_days}, final_score={self.final_score:.3f})>"
        )
class DailySentimentV2(Base):
    """
    Daily aggregate sentiment generated from NewsSentimentV2.

    One row per date (unique constraint), parallel to the legacy
    DailySentiment table.
    """
    __tablename__ = "daily_sentiments_v2"

    id = Column(BigInteger, primary_key=True, autoincrement=True)

    date = Column(DateTime(timezone=True), nullable=False, unique=True, index=True)

    # Aggregated sentiment index for the day
    sentiment_index = Column(Float, nullable=False, index=True)

    # Statistics over the day's scored articles
    news_count = Column(Integer, nullable=False, default=0)
    avg_confidence = Column(Float, nullable=True)
    avg_relevance = Column(Float, nullable=True)

    # Marks which scoring pipeline produced the row
    source_version = Column(String(20), nullable=False, default="v2")

    aggregated_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow, index=True)

    def __repr__(self):
        return (
            "<DailySentimentV2(date="
            f"{self.date}, sentiment_index={self.sentiment_index:.3f}, news_count={self.news_count})>"
        )
| # ============================================================================= | |
| # TFT-ASRO: Deep Learning Pipeline Tables | |
| # ============================================================================= | |
class NewsEmbedding(Base):
    """
    FinBERT CLS token embeddings for news articles.

    Stores both the full 768-dim vector and a PCA-reduced representation
    used by the Temporal Fusion Transformer. One embedding per processed
    article (unique FK).
    """
    __tablename__ = "news_embeddings"

    id = Column(BigInteger, primary_key=True, autoincrement=True)

    news_processed_id = Column(
        BigInteger,
        ForeignKey("news_processed.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
        index=True,
    )

    # Serialized vectors — presumably packed float bytes (e.g. numpy
    # .tobytes()); confirm the encoding with the embedding writer.
    # Full vector is optional, the PCA-reduced one is required.
    embedding_full = Column(LargeBinary, nullable=True)
    embedding_pca = Column(LargeBinary, nullable=False)

    # Version tag of the PCA transform so reduced vectors stay comparable
    pca_version = Column(String(20), nullable=False)

    created_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow)

    processed = relationship("NewsProcessed")

    def __repr__(self):
        return f"<NewsEmbedding(processed_id={self.news_processed_id}, pca={self.pca_version})>"
class LMEWarehouseData(Base):
    """
    LME copper warehouse stock data: total stocks, cancelled warrants,
    and derived ratios used as physical-market features for the TFT.

    One row per date (unique constraint).
    """
    __tablename__ = "lme_warehouse_data"

    id = Column(BigInteger, primary_key=True, autoincrement=True)

    date = Column(DateTime(timezone=True), unique=True, nullable=False, index=True)

    # Stock levels in tonnes; only the total is required
    total_stock_tonnes = Column(Float, nullable=False)
    cancelled_warrants_tonnes = Column(Float, nullable=True)
    on_warrant_tonnes = Column(Float, nullable=True)

    # Derived ratio — presumably cancelled / total; confirm in the fetcher
    cancelled_ratio = Column(Float, nullable=True)

    fetched_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow)

    def __repr__(self):
        return f"<LMEWarehouseData(date={self.date}, stock={self.total_stock_tonnes})>"
class TFTModelMetadata(Base):
    """
    Persisted TFT-ASRO model metadata (parallel to the XGBoost
    ModelMetadata table). One row per symbol, updated on retrain.
    """
    __tablename__ = "tft_model_metadata"

    id = Column(Integer, primary_key=True, autoincrement=True)

    symbol = Column(String(20), nullable=False, unique=True, index=True)

    # Model configuration and training metrics, serialized as JSON text
    config_json = Column(Text, nullable=True)
    metrics_json = Column(Text, nullable=True)

    # Filesystem path to the saved checkpoint (may not survive Space
    # restarts, unlike this DB row)
    checkpoint_path = Column(String(500), nullable=True)

    trained_at = Column(DateTime(timezone=True), nullable=False, default=_utcnow, index=True)

    def __repr__(self):
        return f"<TFTModelMetadata(symbol={self.symbol}, trained_at={self.trained_at})>"