heisbuba committed on
Commit
cade15b
·
verified ·
1 Parent(s): 346305b

Upload analysis.py

Browse files
Files changed (1) hide show
  1. src/services/analysis.py +21 -8
src/services/analysis.py CHANGED
@@ -27,7 +27,7 @@ ORIGINAL_HTML_STYLE = """
27
 
28
  ORIGINAL_MATCHED_HEADERS = ["Ticker", "Spot MrktCap", "Spot Volume", "Spot VTMR", "Futures Volume", "Futures VTMR", "OISS", "Funding Rate"]
29
  ORIGINAL_FUTURES_HEADERS = ["Ticker", "Market Cap", "Volume", "VTMR", "OISS", "Funding Rate"]
30
- ORIGINAL_SPOT_HEADERS = ["Ticker", "Market Cap", "Volume", "Spot VTMR"]
31
 
32
  class FileScanner:
33
  """Locates the latest Spot and Futures data files in the USER directory."""
@@ -79,10 +79,11 @@ class DataProcessor:
79
  def load_spot(path: Path) -> pd.DataFrame:
80
  print(f" Parsing Spot File: {path.name}")
81
  try:
 
82
  if path.suffix == '.html':
83
- df = pd.read_html(path)[0]
84
  else:
85
- df = pd.read_csv(path)
86
  df.columns = [c.lower().replace(' ', '_') for c in df.columns]
87
 
88
  col_map = {
@@ -106,7 +107,8 @@ class DataProcessor:
106
  break
107
 
108
  if 'ticker' in df.columns:
109
- df['ticker'] = df['ticker'].apply(lambda x: re.sub(r'[^A-Z0-9]', '', str(x).upper()))
 
110
  print(f" Extracted {len(df)} spot tokens")
111
  return df
112
  except Exception as e:
@@ -123,6 +125,7 @@ class DataProcessor:
123
  df_display[m] = ""
124
  df_display = df_display[df_cols]
125
  df_display.columns = headers
 
126
  table_html = df_display.to_html(index=False, classes='table', escape=False)
127
  return f'<div class="table-container"><h2>{title}</h2>{table_html}</div>'
128
 
@@ -144,7 +147,7 @@ class DataProcessor:
144
  print(f" Futures high-quality filtering error: {e}")
145
  valid_futures['vtmr_display'] = valid_futures['vtmr']
146
 
147
- # Create the 3 main datasets: Overlap, Futures-Only, Spot-Only
148
  merged = pd.merge(spot_df, valid_futures, on='ticker', how='inner', suffixes=('_spot', '_fut'))
149
  if 'vtmr_fut' in merged.columns:
150
  merged = merged.sort_values('vtmr_fut', ascending=False)
@@ -254,13 +257,23 @@ def crypto_analysis_v4(user_keys, user_id) -> None:
254
 
255
  # 2. Parse Files
256
  futures_df = PDFParser.extract(futures_file)
 
 
 
 
 
 
 
 
 
 
 
 
257
  spot_df = DataProcessor.load_spot(spot_file)
258
 
259
- # 3. Generate HTML
260
  html_content = DataProcessor.generate_html_report(futures_df, spot_df)
261
-
262
  if html_content:
263
- # 4. Create PDF
264
  pdf_path = convert_html_to_pdf(html_content, user_id)
265
 
266
  print(" 🧹 Cleaning up source files after analysis...")
 
27
 
28
  ORIGINAL_MATCHED_HEADERS = ["Ticker", "Spot MrktCap", "Spot Volume", "Spot VTMR", "Futures Volume", "Futures VTMR", "OISS", "Funding Rate"]
29
  ORIGINAL_FUTURES_HEADERS = ["Ticker", "Market Cap", "Volume", "VTMR", "OISS", "Funding Rate"]
30
+ ORIGINAL_SPOT_HEADERS = ["Ticker", "MarketCap", "Volume", "VTMR"]
31
 
32
  class FileScanner:
33
  """Locates the latest Spot and Futures data files in the USER directory."""
 
79
  def load_spot(path: Path) -> pd.DataFrame:
80
  print(f" Parsing Spot File: {path.name}")
81
  try:
82
+ # Explicit UTF-8 for Unicode preservation
83
  if path.suffix == '.html':
84
+ df = pd.read_html(str(path), encoding='utf-8')[0]
85
  else:
86
+ df = pd.read_csv(path, encoding='utf-8')
87
  df.columns = [c.lower().replace(' ', '_') for c in df.columns]
88
 
89
  col_map = {
 
107
  break
108
 
109
  if 'ticker' in df.columns:
110
+ # Unicode-safe cleaning to protect Chinese characters
111
+ df['ticker'] = df['ticker'].apply(lambda x: str(x).strip().upper())
112
  print(f" Extracted {len(df)} spot tokens")
113
  return df
114
  except Exception as e:
 
125
  df_display[m] = ""
126
  df_display = df_display[df_cols]
127
  df_display.columns = headers
128
+ # escape=False is critical for rendering ticker links
129
  table_html = df_display.to_html(index=False, classes='table', escape=False)
130
  return f'<div class="table-container"><h2>{title}</h2>{table_html}</div>'
131
 
 
147
  print(f" Futures high-quality filtering error: {e}")
148
  valid_futures['vtmr_display'] = valid_futures['vtmr']
149
 
150
+ # Suffix-based merge to prevent blank column mapping issues
151
  merged = pd.merge(spot_df, valid_futures, on='ticker', how='inner', suffixes=('_spot', '_fut'))
152
  if 'vtmr_fut' in merged.columns:
153
  merged = merged.sort_values('vtmr_fut', ascending=False)
 
257
 
258
  # 2. Parse Files
259
  futures_df = PDFParser.extract(futures_file)
260
+
261
+ # FIX: "First Uppercase Ticker" logic to prevent labeling shift
262
+ if not futures_df.empty and 'ticker' in futures_df.columns:
263
+ def refine_ticker(x):
264
+ text = str(x)
265
+ # Find all words that are entirely uppercase (2+ chars)
266
+ matches = re.findall(r'\b[A-Z]{2,}\b', text)
267
+ # Use the first match; fall back to the stripped, upper-cased original value if none (e.g. Chinese)
268
+ return matches[0] if matches else text.strip().upper()
269
+
270
+ futures_df['ticker'] = futures_df['ticker'].apply(refine_ticker)
271
+
272
  spot_df = DataProcessor.load_spot(spot_file)
273
 
 
274
  html_content = DataProcessor.generate_html_report(futures_df, spot_df)
 
275
  if html_content:
276
+ # Create PDF
277
  pdf_path = convert_html_to_pdf(html_content, user_id)
278
 
279
  print(" 🧹 Cleaning up source files after analysis...")