Spaces:
Running
Running
| # from pathlib import Path | |
| # import pickle | |
| # from sentence_transformers import SentenceTransformer | |
| # BASE_DIR = Path(__file__).resolve().parent.parent | |
| # pickle_file = BASE_DIR / "vector_store" / "analyze_embeddings.pkl" | |
| # instruction_file = BASE_DIR / "instructions" / "analyze.txt" | |
| # def embed_analyze_instructions(): | |
| # instruction_file = BASE_DIR / "instructions" / "analyze.txt" | |
| # pickle_file = BASE_DIR / "vector_store" / "analyze_embeddings.pkl" | |
| # # Ensure directory exists | |
| # pickle_file.parent.mkdir(parents=True, exist_ok=True) | |
| # # If embeddings already exist, load | |
| # if pickle_file.exists(): | |
| # with open(pickle_file, "rb") as f: | |
| # data = pickle.load(f) | |
| # # print("Analyze embeddings already exist. Loaded from disk.") | |
| # return data | |
| # # Load instructions | |
| # with open(instruction_file, "r", encoding="utf-8") as f: | |
| # instructions = [line.strip() for line in f if line.strip()] | |
| # # Embed | |
| # model = SentenceTransformer('all-MiniLM-L6-v2') | |
| # embeddings = model.encode(instructions) | |
| # pickle_file.parent.mkdir(parents=True, exist_ok=True) | |
| # # Save | |
| # data = {"instructions": instructions, "embeddings": embeddings} | |
| # with open(pickle_file, "wb") as f: | |
| # pickle.dump(data, f) | |
| # print(f"Instruction embeddings created and saved: {len(instructions)} instructions") | |
| # return data | |
| # if __name__ == "__main__": | |
| # embed_analyze_instructions() | |
| import os | |
| import pickle | |
| import requests | |
| from pathlib import Path | |
| from utils.logger import logger | |
| BASE_DIR = Path(__file__).resolve().parent.parent | |
| PICKLE_FILE = BASE_DIR / "vector_store" / "analyze_embeddings.pkl" | |
| INSTRUCTION_FILE = BASE_DIR / "instructions" / "analyze.txt" | |
| def get_ollama_embeddings(texts): | |
| """Try to get embeddings from local Ollama service.""" | |
| try: | |
| # Default Ollama address | |
| url = "http://localhost:11434/api/embed" | |
| # Note: Some Ollama versions use /api/embeddings (plural) | |
| embeddings = [] | |
| for text in texts: | |
| response = requests.post( | |
| url, | |
| json={"model": "mxbai-embed-large", "input": text}, | |
| timeout=5 | |
| ) | |
| embeddings.append(response.json()['embeddings'][0]) | |
| return embeddings | |
| except Exception: | |
| return None | |
| def get_hf_api_embeddings(texts): | |
| """Try to get embeddings via Hugging Face Inference API.""" | |
| token = os.environ.get("HF_TOKEN") | |
| if not token: | |
| return None | |
| api_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2" | |
| headers = {"Authorization": f"Bearer {token}"} | |
| try: | |
| response = requests.post(api_url, headers=headers, json={"inputs": texts}, timeout=10) | |
| return response.json() | |
| except Exception: | |
| return None | |
| def embed_analyze_instructions(): | |
| # 1. Ensure directory exists | |
| PICKLE_FILE.parent.mkdir(parents=True, exist_ok=True) | |
| # 2. Check if cached embeddings exist | |
| if PICKLE_FILE.exists(): | |
| with open(PICKLE_FILE, "rb") as f: | |
| return pickle.load(f) | |
| # 3. Load instructions from file | |
| if not INSTRUCTION_FILE.exists(): | |
| logger.error(f"Instruction file not found at {INSTRUCTION_FILE}") | |
| return None | |
| with open(INSTRUCTION_FILE, "r", encoding="utf-8") as f: | |
| instructions = [line.strip() for line in f if line.strip()] | |
| embeddings = None | |
| # --- FALLBACK LOGIC --- | |
| # Try Ollama First | |
| logger.info("Attempting Ollama embeddings...") | |
| embeddings = get_ollama_embeddings(instructions) | |
| # Try HF API Second | |
| if embeddings is None: | |
| logger.info("Ollama failed. Attempting Hugging Face API...") | |
| embeddings = get_hf_api_embeddings(instructions) | |
| # Local Heavy Fallback Third | |
| if embeddings is None: | |
| logger.warning("External APIs failed. Loading heavy local SentenceTransformer...") | |
| # Lazy import: Only loads Torch/Transformers if absolutely necessary | |
| from sentence_transformers import SentenceTransformer | |
| model = SentenceTransformer('all-MiniLM-L6-v2') | |
| embeddings = model.encode(instructions) | |
| # 4. Save and Return | |
| if embeddings is not None: | |
| data = {"instructions": instructions, "embeddings": embeddings} | |
| with open(PICKLE_FILE, "wb") as f: | |
| pickle.dump(data, f) | |
| logger.info(f"Embeddings saved: {len(instructions)} instructions") | |
| return data | |
| logger.error("Failed to generate embeddings via any method.") | |
| return None | |
| if __name__ == "__main__": | |
| embed_analyze_instructions() | |