Spaces:

heisbuba
/

quantvat

Running

App Files Files Community

heisbuba committed on Jan 31

Commit

cade15b

verified ·

1 Parent(s): 346305b

Upload analysis.py

Browse files

Files changed (1) hide show

src/services/analysis.py +21 -8

src/services/analysis.py CHANGED Viewed

@@ -27,7 +27,7 @@ ORIGINAL_HTML_STYLE = """
 ORIGINAL_MATCHED_HEADERS = ["Ticker", "Spot MrktCap", "Spot Volume", "Spot VTMR", "Futures Volume", "Futures VTMR", "OISS", "Funding Rate"]
 ORIGINAL_FUTURES_HEADERS = ["Ticker", "Market Cap", "Volume", "VTMR", "OISS", "Funding Rate"]
-ORIGINAL_SPOT_HEADERS = ["Ticker", "Market Cap", "Volume", "Spot VTMR"]
 class FileScanner:
     """Locates the latest Spot and Futures data files in the USER directory."""
@@ -79,10 +79,11 @@ class DataProcessor:
     def load_spot(path: Path) -> pd.DataFrame:
         print(f"   Parsing Spot File: {path.name}")
         try:
             if path.suffix == '.html':
-                df = pd.read_html(path)[0]
             else:
-                df = pd.read_csv(path)
             df.columns = [c.lower().replace(' ', '_') for c in df.columns]
             col_map = {
@@ -106,7 +107,8 @@ class DataProcessor:
                         break
             if 'ticker' in df.columns:
-                df['ticker'] = df['ticker'].apply(lambda x: re.sub(r'[^A-Z0-9]', '', str(x).upper()))
             print(f"   Extracted {len(df)} spot tokens")
             return df
         except Exception as e:
@@ -123,6 +125,7 @@ class DataProcessor:
             df_display[m] = ""
         df_display = df_display[df_cols]
         df_display.columns = headers
         table_html = df_display.to_html(index=False, classes='table', escape=False)
         return f'<div class="table-container"><h2>{title}</h2>{table_html}</div>'
@@ -144,7 +147,7 @@ class DataProcessor:
             print(f"   Futures high-quality filtering error: {e}")
             valid_futures['vtmr_display'] = valid_futures['vtmr']
-        # Create the 3 main datasets: Overlap, Futures-Only, Spot-Only
         merged = pd.merge(spot_df, valid_futures, on='ticker', how='inner', suffixes=('_spot', '_fut'))
         if 'vtmr_fut' in merged.columns:
             merged = merged.sort_values('vtmr_fut', ascending=False)
@@ -254,13 +257,23 @@ def crypto_analysis_v4(user_keys, user_id) -> None:
     # 2. Parse Files
     futures_df = PDFParser.extract(futures_file)
     spot_df = DataProcessor.load_spot(spot_file)
-    # 3. Generate HTML
     html_content = DataProcessor.generate_html_report(futures_df, spot_df)
     if html_content:
-        # 4. Create PDF
         pdf_path = convert_html_to_pdf(html_content, user_id)
         print("   🧹 Cleaning up source files after analysis...")

 ORIGINAL_MATCHED_HEADERS = ["Ticker", "Spot MrktCap", "Spot Volume", "Spot VTMR", "Futures Volume", "Futures VTMR", "OISS", "Funding Rate"]
 ORIGINAL_FUTURES_HEADERS = ["Ticker", "Market Cap", "Volume", "VTMR", "OISS", "Funding Rate"]
+ORIGINAL_SPOT_HEADERS = ["Ticker", "MarketCap", "Volume", "VTMR"]
 class FileScanner:
     """Locates the latest Spot and Futures data files in the USER directory."""
     def load_spot(path: Path) -> pd.DataFrame:
         print(f"   Parsing Spot File: {path.name}")
         try:
+            # Explicit UTF-8 for Unicode preservation
             if path.suffix == '.html':
+                df = pd.read_html(str(path), encoding='utf-8')[0]
             else:
+                df = pd.read_csv(path, encoding='utf-8')
             df.columns = [c.lower().replace(' ', '_') for c in df.columns]
             col_map = {
                         break
             if 'ticker' in df.columns:
+                # Unicode-safe cleaning to protect Chinese characters
+                df['ticker'] = df['ticker'].apply(lambda x: str(x).strip().upper())
             print(f"   Extracted {len(df)} spot tokens")
             return df
         except Exception as e:
             df_display[m] = ""
         df_display = df_display[df_cols]
         df_display.columns = headers
+        # escape=False is critical for rendering ticker links
         table_html = df_display.to_html(index=False, classes='table', escape=False)
         return f'<div class="table-container"><h2>{title}</h2>{table_html}</div>'
             print(f"   Futures high-quality filtering error: {e}")
             valid_futures['vtmr_display'] = valid_futures['vtmr']
+        # Suffix-based merge to prevent blank column mapping issues
         merged = pd.merge(spot_df, valid_futures, on='ticker', how='inner', suffixes=('_spot', '_fut'))
         if 'vtmr_fut' in merged.columns:
             merged = merged.sort_values('vtmr_fut', ascending=False)
     # 2. Parse Files
     futures_df = PDFParser.extract(futures_file)
+    # FIX: "First Uppercase Ticker" logic to prevent labeling shift
+    if not futures_df.empty and 'ticker' in futures_df.columns:
+        def refine_ticker(x):
+            text = str(x)
+            # Find all words that are entirely uppercase (2+ chars)
+            matches = re.findall(r'\b[A-Z]{2,}\b', text)
+            # Use the first match; fall back to the stripped, upper-cased original if none (e.g. Chinese)
+            return matches[0] if matches else text.strip().upper()
+        futures_df['ticker'] = futures_df['ticker'].apply(refine_ticker)
     spot_df = DataProcessor.load_spot(spot_file)
     html_content = DataProcessor.generate_html_report(futures_df, spot_df)
     if html_content:
+        # Create PDF
         pdf_path = convert_html_to_pdf(html_content, user_id)
         print("   🧹 Cleaning up source files after analysis...")