newFinderAgent_v2 / src /cache.py
chummchumm's picture
Upload 6 files
8b425b2 verified
import gspread
from google.oauth2.service_account import Credentials
import streamlit as st
import os
import json
# --- 1. SETUP GOOGLE SHEETS CONNECTION ---
def connect_to_sheet(sheet_name):
scopes = [
"https://www.googleapis.com/auth/spreadsheets",
"https://www.googleapis.com/auth/drive"
]
# --- STRATEGY 1: LOCAL FILE (Priority) ---
if os.path.exists("credentials.json"):
print("πŸ“‚ Using local 'credentials.json' file.")
creds = Credentials.from_service_account_file("credentials.json", scopes=scopes)
# --- STRATEGY 2: ENVIRONMENT VARIABLE (Fallback for Deploy) ---
else:
print("☁️ 'credentials.json' not found. Checking Environment Variables...")
creds_json_str = os.environ.get("gcp_service_account")
if not creds_json_str:
raise ValueError(
"❌ Error: Could not find 'credentials.json' LOCALLY, and 'gcp_service_account' is missing from ENV vars."
)
creds_dict = json.loads(creds_json_str)
creds = Credentials.from_service_account_info(creds_dict, scopes=scopes)
# Authorize & Open
client = gspread.authorize(creds)
return client.open(sheet_name).sheet1
# def connect_to_sheet(sheet_name):
# # Define the scopes (Permissions)
# scopes = [
# "https://www.googleapis.com/auth/spreadsheets",
# "https://www.googleapis.com/auth/drive"
# ]
#
# creds_json_str = os.environ.get("gcp_service_account")
#
# if not creds_json_str:
# raise ValueError(
# "❌ Error: Could not find 'gcp_service_account' in Environment Variables. Did you add the secret in Hugging Face settings?")
#
# # 2. Convert the String back into a Python Dictionary
# creds_dict = json.loads(creds_json_str)
#
# # 3. Create credentials object
# creds = Credentials.from_service_account_info(
# creds_dict,
# scopes=scopes
# )
#
# # Authorize gspread
# client = gspread.authorize(creds)
#
# # Open the sheet
# return client.open(sheet_name).sheet1
def load_cache_dict(sheet):
"""Returns a dict: {'company name': 'url'} for fast lookup."""
try:
data = sheet.get_all_records()
# Create dict: lowercased name -> url
return {row['Company'].lower().strip(): row['Website'] for row in data if row['Company']}
except Exception as e:
print(f"⚠️ Cache read error (empty sheet?): {e}")
return {}
def append_to_cache(sheet, new_entries):
"""Appends list of {'Company': name, 'Website': url} to sheet."""
if not new_entries:
return
rows = [[entry['Company'], entry['Website']] for entry in new_entries]
try:
sheet.append_rows(rows)
print(f"πŸ’Ύ Cached {len(rows)} new companies.")
except Exception as e:
print(f"❌ Error saving to cache: {e}")
# # --- 2. LOAD CACHE (READ URLS FROM SHEET) ---
# def load_cache(sheet):
# """
# Reads the entire sheet and creates a dictionary:
# {'Nvidia': 'https://nvidia.com', 'Tesla': 'https://tesla.com'}
# """
# print("πŸ“‚ Reading existing data from Google Sheet...")
# data = sheet.get_all_records() # Assumes headers: "Company", "Website"
#
# # Create a quick lookup dictionary (Normalize names to lowercase to be safe)
# cache = {row['Company'].lower().strip(): row['Website'] for row in data if row['Company']}
# return cache
#
#
# # --- 3. THE "CASH SAVER" FUNCTION ---
# def save_companies_to_cache(new_rows_to_add, sheet_name):
# sheet = connect_to_sheet(sheet_name)
#
# # 1. Check if there is data
# if new_rows_to_add:
# print(f"πŸ’Ύ Saving {len(new_rows_to_add)} new companies to Sheet...")
#
# # 2. CONVERT Dicts to Lists
# values_to_upload = [
# [item.get('company_name'), item.get('company_website')]
# for item in new_rows_to_add
# ]
#
# # 3. Append to Sheet
# sheet.append_rows(values_to_upload)
# print("βœ… Save Complete.")
#
# else:
# print("πŸŽ‰ No new searches needed. Sheet is up to date!")
#
#
# # --- 3. THE LOAD CACHED COMPANIES FUNCTION ---
# def get_cached_companies(company_list, sheet_name):
# """
# Splits companies into 'Found in Cache' and 'Missing (Need to Search)'
# """
# sheet = connect_to_sheet(sheet_name)
# cache = load_cache(sheet)
#
# companies_in_cache = []
# missing_companies = []
#
# print(f"πŸ“‚ Checking Cache for {len(company_list)} companies...")
# # print(company_list)
#
# for item in company_list:
# name = item # item['company_name']
# # Normalize key for matching (must match how you save them)
# name_key = name.lower().strip()
#
# # === CHECK CACHE ===
# # We check if key exists AND value is not empty
# if name_key in cache and cache[name_key]:
# # print(f" βœ… Cache Hit: {name}") # Optional: Comment out to reduce noise
# companies_in_cache.append({
# 'company_name': name,
# 'company_website': cache[name_key]
# })
# # === MISSING ===
# else:
# # print(f" πŸ”Ž Cache Miss: {name}")
# missing_companies.append(item)
#
# # RETURN OUTSIDE THE LOOP
# return {
# 'found': companies_in_cache,
# 'missing': missing_companies
# }