heisbuba committed on
Commit
1c263aa
·
verified ·
1 Parent(s): 916d54b

Upload futures_engine.py

Browse files
Files changed (1) hide show
  1. src/services/futures_engine.py +130 -92
src/services/futures_engine.py CHANGED
@@ -3,12 +3,10 @@ import pandas as pd
3
  from dataclasses import dataclass
4
  from typing import List, Optional, Tuple
5
 
6
- # Integrated Docling for high-accuracy layout reconstruction
7
  try:
8
- from docling.document_converter import DocumentConverter
9
- DOCLING_AVAILABLE = True
10
- except ImportError:
11
- DOCLING_AVAILABLE = False
12
 
13
  @dataclass
14
  class TokenData:
@@ -21,18 +19,22 @@ class TokenData:
21
  oiss: str = "-"
22
 
23
  class PDFParser:
24
- """Handles 100% accuracy extraction of futures data from web-printed PDFs."""
25
 
26
- # Financial cleaning pattern for raw strings
27
- CLEAN_VAL = re.compile(r'[\$\,\%\s]')
 
 
 
 
 
28
 
29
- # Keywords to filter out website UI elements repeated by Chrome Print
30
  IGNORE_KEYWORDS = {
31
- 'page', 'coinalyze', 'contract', 'filter', 'column', 'coins',
32
- 'mkt cap', 'vol 24h', 'vtmr', 'all contracts', 'custom metrics', 'watchlist'
33
  }
34
 
35
- # --- Signal Helpers (Maintained 100% Original Logic) ---
36
 
37
  @staticmethod
38
  def _oi_score_and_signal(oi_change: float) -> Tuple[int, str]:
@@ -53,105 +55,141 @@ class PDFParser:
53
 
54
  @classmethod
55
  def make_oiss(cls, oi_percent_str: str) -> str:
56
- if not oi_percent_str or str(oi_percent_str).strip() in ['-', 'N/A']: return "-"
57
- val = cls.CLEAN_VAL.sub("", str(oi_percent_str))
58
  try:
59
  oi_change = float(val) / 100
60
- _, signal = cls._oi_score_and_signal(oi_change)
61
- css_class = "oi-strong" if oi_change > 0 else "oi-weak" if oi_change < 0 else ""
 
 
 
 
62
  sign = "+" if oi_change > 0 else ""
63
  if css_class:
64
- return f'<span class="{css_class}">{sign}{oi_change*100:.1f}%</span> {signal}'
65
- return f"{sign}{oi_change*100:.1f}% {signal}"
66
- except: return "-"
 
67
 
68
  @classmethod
69
  def make_funding_signal(cls, funding_str: str) -> str:
70
- if not funding_str or str(funding_str).strip() in ['-', 'N/A']: return "-"
71
  try:
72
- val = float(cls.CLEAN_VAL.sub("", str(funding_str)))
73
  signal_word, css_class = cls._funding_score_and_signal(val)
 
74
  if css_class:
75
- return f'<span class="{css_class}">{val}%</span> <span style="font-size:0.8em;">{signal_word}</span>'
76
  return f'{val}% {signal_word}'
77
- except: return str(funding_str)
 
78
 
79
- # --- Robust AI Extraction Logic ---
80
 
81
  @classmethod
82
- def extract(cls, path: str) -> pd.DataFrame:
83
- print(f" [!] AI Engine: Processing {path}")
84
- if not DOCLING_AVAILABLE:
85
- print(" [X] Deployment Error: 'docling' library is not installed.")
86
  return pd.DataFrame()
87
-
88
  try:
89
- # DocumentConverter uses layout models to "re-build" the grid Chrome destroyed
90
- converter = DocumentConverter()
91
- result = converter.convert(str(path))
92
- all_tokens: List[TokenData] = []
93
-
94
- for element, _ in result.document.iterate_items():
95
- if hasattr(element, "data") and hasattr(element.data, "table"):
96
- # High-accuracy dataframe reconstruction
97
- df_raw = element.export_to_dataframe()
98
- all_tokens.extend(cls._process_rows(df_raw))
99
-
100
- if not all_tokens:
101
- print(" [!] Warning: No token data found in PDF.")
102
  return pd.DataFrame()
103
-
104
- final_df = pd.DataFrame([vars(t) for t in all_tokens])
105
-
106
- # Clean ticker duplicates (e.g., if a token repeats across page breaks)
107
- final_df = final_df.drop_duplicates(subset=['ticker'], keep='first')
108
-
109
- print(f" [+] Successfully extracted {len(final_df)} tokens.")
110
- return final_df
111
-
112
  except Exception as e:
113
- print(f" [X] Critical Engine Failure: {e}")
114
  return pd.DataFrame()
115
 
116
  @classmethod
117
- def _process_rows(cls, df: pd.DataFrame) -> List[TokenData]:
118
- tokens = []
119
- for _, row in df.iterrows():
120
- try:
121
- # 1. Clean and validate the 'COIN' block
122
- coin_cell = str(row[0]).strip()
123
- if any(k in coin_cell.lower() for k in cls.IGNORE_KEYWORDS) or len(coin_cell) < 3:
124
- continue
125
-
126
- # Chrome stacks Name and Ticker with \n. Split and filter empty parts.
127
- parts = [p.strip() for p in coin_cell.split('\n') if p.strip()]
128
- if not parts: continue
129
-
130
- name = parts[0]
131
- ticker_raw = parts[1] if len(parts) > 1 else name
132
- ticker = re.sub(r'[^A-Z0-9]', '', ticker_raw.upper())
133
-
134
- # 2. Extract and Sanitize Columns based on Coinalyze Layout
135
- # Mapping: 0:Coin, 1:Mkt Cap, 2:Vol 24h, 3:OI Chg, 4:PFR, 5:VTMR
136
- mkt_cap = str(row[1]).strip()
137
- volume = str(row[2]).strip()
138
-
139
- # Robust VTMR cleaning (handles Chrome artifacts and empty fields)
140
- vtmr_raw = cls.CLEAN_VAL.sub("", str(row[5])) if len(row) > 5 else ""
141
  try:
142
- vtmr_float = float(vtmr_raw) if vtmr_raw else 0.0
143
- except ValueError:
144
- vtmr_float = 0.0
145
-
146
- tokens.append(TokenData(
147
- ticker=ticker,
148
- name=name,
149
- market_cap=mkt_cap,
150
- volume=volume,
151
- vtmr=vtmr_float,
152
- funding=cls.make_funding_signal(row[4]),
153
- oiss=cls.make_oiss(row[3])
154
- ))
155
- except Exception:
156
- continue # Skip individual malformed rows to ensure 100% completion
157
- return tokens
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from dataclasses import dataclass
4
  from typing import List, Optional, Tuple
5
 
 
6
  try:
7
+ import pypdf
8
+ except Exception:
9
+ pypdf = None
 
10
 
11
  @dataclass
12
  class TokenData:
 
19
  oiss: str = "-"
20
 
21
  class PDFParser:
22
+ """Handles extraction of tabular data from Coinalyze PDFs using regex."""
23
 
24
+ FINANCIAL_PATTERN = re.compile(
25
+ r'(\$?[+-]?[\d,\.]+[kKmMbB]?)\s+'
26
+ r'(\$?[+-]?[\d,\.]+[kKmMbB]?)\s+'
27
+ r'(?:([+\-]?[\d\.\,]+\%?|[\-\–\—]|N\/A)\s+)?'
28
+ r'(?:([+\-]?[\d\.\,]+\%?|[\-\–\—]|N\/A)\s+)?'
29
+ r'(\d*\.?\d+)'
30
+ )
31
 
 
32
  IGNORE_KEYWORDS = {
33
+ 'page', 'coinalyze', 'contract', 'filter', 'column',
34
+ 'mkt cap', 'vol 24h', 'vtmr', 'coins', 'all contracts', 'custom metrics', 'watchlists'
35
  }
36
 
37
+ # --- Signal Helpers (Moved inside to keep logic self-contained) ---
38
 
39
  @staticmethod
40
  def _oi_score_and_signal(oi_change: float) -> Tuple[int, str]:
 
55
 
56
  @classmethod
57
  def make_oiss(cls, oi_percent_str: str) -> str:
58
+ if not oi_percent_str: return "-"
59
+ val = oi_percent_str.replace("%", "").strip()
60
  try:
61
  oi_change = float(val) / 100
62
+ score, signal = cls._oi_score_and_signal(oi_change)
63
+
64
+ if oi_change > 0: css_class = "oi-strong"
65
+ elif oi_change < 0: css_class = "oi-weak"
66
+ else: css_class = ""
67
+
68
  sign = "+" if oi_change > 0 else ""
69
  if css_class:
70
+ return f'<span class="{css_class}">{sign}{oi_change*100:.0f}%</span> {signal}'
71
+ return f"{sign}{oi_change*100:.0f}% {signal}"
72
+ except Exception:
73
+ return "-"
74
 
75
  @classmethod
76
  def make_funding_signal(cls, funding_str: str) -> str:
77
+ if not funding_str or funding_str in ['-', 'N/A']: return "-"
78
  try:
79
+ val = float(funding_str.replace('%', '').strip())
80
  signal_word, css_class = cls._funding_score_and_signal(val)
81
+
82
  if css_class:
83
+ return f'<span class="{css_class}">{val}%</span> <span style="font-size:0.8em; color:#7f8c8d;">{signal_word}</span>'
84
  return f'{val}% {signal_word}'
85
+ except Exception:
86
+ return funding_str
87
 
88
+ # --- Core Extraction Logic ---
89
 
90
  @classmethod
91
+ def extract(cls, path) -> pd.DataFrame:
92
+ print(f" Parsing Futures PDF: {path.name}")
93
+ if pypdf is None:
94
+ print(" pypdf not available - PDF parsing disabled.")
95
  return pd.DataFrame()
96
+ data: List[TokenData] = []
97
  try:
98
+ reader = pypdf.PdfReader(path)
99
+ for page in reader.pages:
100
+ raw = page.extract_text() or ""
101
+ lines = [ln.strip() for ln in raw.split("\n") if ln.strip()]
102
+ page_data = cls._parse_page_smart(lines)
103
+ data.extend(page_data)
104
+ print(f" Extracted {len(data)} futures tokens")
105
+ if not data:
 
 
 
 
 
106
  return pd.DataFrame()
107
+ df = pd.DataFrame([vars(t) for t in data])
108
+ df['ticker'] = df['ticker'].apply(lambda x: re.sub(r'[^A-Z0-9]', '', str(x).upper()))
109
+ df = df[df['ticker'].str.len() > 1]
110
+ print(f" Valid futures tokens: {len(df)}")
111
+ return df
 
 
 
 
112
  except Exception as e:
113
+ print(f" PDF Error: {e}")
114
  return pd.DataFrame()
115
 
116
  @classmethod
117
+ def _parse_page_smart(cls, lines: List[str]) -> List[TokenData]:
118
+ financials = []
119
+ raw_text_lines = []
120
+
121
+ for line in lines:
122
+ if any(k in line.lower() for k in cls.IGNORE_KEYWORDS):
123
+ continue
124
+
125
+ fin_match = cls.FINANCIAL_PATTERN.search(line)
126
+ if fin_match:
127
+ groups = fin_match.groups()
128
+ mc = groups[0].replace('$', '').replace(',', '')
129
+ vol = groups[1].replace('$', '').replace(',', '')
130
+ oi_str = groups[2]
131
+ fund_str = groups[3]
132
+ vtmr = groups[4]
 
 
 
 
 
 
 
 
133
  try:
134
+ float(vtmr)
135
+ financials.append((mc, vol, vtmr, oi_str, fund_str))
136
+ except:
137
+ raw_text_lines.append(line)
138
+ else:
139
+ if not line.isdigit() and len(line) > 1:
140
+ raw_text_lines.append(line)
141
+
142
+ token_pairs = []
143
+ i = 0
144
+ while i < len(raw_text_lines):
145
+ line = raw_text_lines[i]
146
+ clean_current = cls._clean_ticker_strict(line)
147
+
148
+ if clean_current:
149
+ if i + 1 < len(raw_text_lines):
150
+ next_line = raw_text_lines[i + 1]
151
+ clean_next = cls._clean_ticker_strict(next_line)
152
+ if clean_next:
153
+ token_pairs.append((line, clean_next))
154
+ i += 2
155
+ continue
156
+
157
+ if i + 1 < len(raw_text_lines):
158
+ name_candidate = raw_text_lines[i]
159
+ ticker_candidate_raw = raw_text_lines[i + 1]
160
+ ticker = cls._clean_ticker_strict(ticker_candidate_raw)
161
+ if ticker:
162
+ token_pairs.append((name_candidate, ticker))
163
+ i += 2
164
+ else:
165
+ i += 1
166
+ else:
167
+ i += 1
168
+
169
+ tokens: List[TokenData] = []
170
+ limit = min(len(token_pairs), len(financials))
171
+
172
+ for k in range(limit):
173
+ name, ticker = token_pairs[k]
174
+ mc, vol, vtmr, oi_pct, fund_pct = financials[k]
175
+
176
+ oiss_val = cls.make_oiss(oi_pct) if oi_pct and oi_pct not in ['-', 'N/A'] else "-"
177
+ funding_val = cls.make_funding_signal(fund_pct)
178
+
179
+ tokens.append(TokenData(
180
+ ticker=ticker,
181
+ name=name,
182
+ market_cap=mc,
183
+ volume=vol,
184
+ vtmr=float(vtmr),
185
+ funding=funding_val,
186
+ oiss=oiss_val
187
+ ))
188
+ return tokens
189
+
190
+ @staticmethod
191
+ def _clean_ticker_strict(text: str) -> Optional[str]:
192
+ if len(text) > 15: return None
193
+ cleaned = re.sub(r'[^A-Z0-9]', '', text.upper())
194
+ if 2 <= len(cleaned) <= 12: return cleaned
195
+ return None