from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader
import requests
import json
import os
import time
import traceback


def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in the PDF, separated by newlines."""
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += (page.extract_text() or "") + "\n"
    return text.strip()


def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split text into overlapping chunks, preferring logical break points."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,  # Overlap preserves context across chunk boundaries
        separators=["\n\n", "\n", " ", ""],  # Prioritize paragraph, then line, then word breaks
    )
    return splitter.split_text(text)


embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def embedding_function(texts):
    """Embed a list of strings into dense vectors (as plain Python lists)."""
    return embedding_model.encode(texts, convert_to_numpy=True).tolist()
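
# --- Illustrative retrieval helper (a sketch, not part of the original pipeline).
# query_llm_with_context() below unpacks a (documents, similarity_scores) tuple but
# nothing in this module builds one; this shows one way it could be produced from
# the helpers above, using plain cosine similarity over in-memory embeddings. The
# function name and the in-memory store are assumptions for illustration.
import numpy as np


def retrieve_context(query, chunks, chunk_embeddings, top_k=5):
    """Rank chunks by cosine similarity to the query embedding.

    Returns (documents, similarity_scores), the tuple shape that
    query_llm_with_context() expects.
    """
    query_vec = np.array(embedding_function([query])[0])
    chunk_matrix = np.array(chunk_embeddings)
    # Cosine similarity = dot product of L2-normalized vectors
    query_vec = query_vec / np.linalg.norm(query_vec)
    chunk_matrix = chunk_matrix / np.linalg.norm(chunk_matrix, axis=1, keepdims=True)
    scores = chunk_matrix @ query_vec
    top_indices = np.argsort(scores)[::-1][:top_k]
    documents = [chunks[i] for i in top_indices]
    similarity_scores = [float(scores[i]) for i in top_indices]
    return documents, similarity_scores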

def generate_hypothetical_answer(query):
    """Ask the LLM for a plausible answer to the query (the HyDE pattern)."""
    # OpenAI-compatible chat completions endpoint on the Hugging Face router
    api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"

    # Get the API token from an environment variable
    api_token = os.getenv("HUGGINGFACE_API_TOKEN")
    if not api_token:
        print("Error: HUGGINGFACE_API_TOKEN environment variable not set")
        return "Error: HUGGINGFACE_API_TOKEN environment variable not set"

    # Headers for the API request. Note: never log these; they contain the token.
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }

    # Prompt for generating a hypothetical answer
    prompt = f"""
Given the following query, generate a hypothetical answer that might be found in a document:

Query: {query}

Hypothetical answer:
"""

    # Chat completions payload, the format this endpoint expects
    payload = {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 256,
        "temperature": 0.7,
        "top_p": 0.95,
    }

    try:
        print("Sending request to Hugging Face API for hypothetical answer...")
        print(f"API URL: {api_url}")
        print(f"Payload: {json.dumps(payload, indent=2)}")

        start_time = time.time()
        # Allow up to 5 minutes for the model to respond
        response = requests.post(api_url, headers=headers, json=payload, timeout=300)
        end_time = time.time()

        print(f"Received hypothetical answer from Hugging Face API in {end_time - start_time:.2f} seconds")
        print(f"Response status code: {response.status_code}")
        print(f"Response content: {response.text[:1000]}...")  # First 1000 chars

        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the chat completions response and extract the generated text
        result = response.json()
        print(f"Parsed response: {json.dumps(result, indent=2)[:1000]}...")  # First 1000 chars
        generated_text = result["choices"][0]["message"]["content"]
        return generated_text.strip()
    except requests.exceptions.Timeout:
        print("Request to Hugging Face API timed out after 5 minutes")
        return "The request timed out. The model is taking too long to respond. Please try again with a simpler query."
    except requests.exceptions.ConnectionError:
        print("Could not connect to Hugging Face API")
        return "Could not connect to the Hugging Face API. Please check your internet connection."
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content: {e.response.text}")
        if e.response.status_code == 401:
            return "Authentication error. Please check your Hugging Face API token."
        elif e.response.status_code == 429:
            return "Rate limit exceeded. Please try again later."
        return f"HTTP error occurred: {e}"
    except Exception as e:
        print(f"Error generating hypothetical answer: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        return "Failed to generate a hypothetical answer."
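
# --- HyDE retrieval sketch (illustrative; builds on the assumed retrieve_context()
# helper above). In Hypothetical Document Embeddings (HyDE), the generated answer is
# embedded instead of the raw query: a plausible answer usually lies closer to the
# relevant chunks in embedding space than a short question does.
def retrieve_context_hyde(query, chunks, chunk_embeddings, top_k=5):
    """Retrieve chunks using the hypothetical answer as the search probe."""
    hypothetical_answer = generate_hypothetical_answer(query)
    return retrieve_context(hypothetical_answer, chunks, chunk_embeddings, top_k=top_k)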

def query_llm_with_context(query, context, top_n=3):
    """Answer the query using only the retrieved documents as context."""
    # Unpack the (documents, similarity_scores) tuple
    documents, similarity_scores = context

    # Use only the top N documents
    top_docs = documents[:top_n]

    # Join the top documents into a single context string
    context_text = "\n\n===Document Boundary===\n\n".join(top_docs)

    # Build a prompt with the context and query
    prompt = f"""
Context information is below.
---------------------
{context_text}
---------------------
Given the context information and not prior knowledge, answer the following query:

Query: {query}
"""

    # OpenAI-compatible chat completions endpoint on the Hugging Face router
    api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"

    # Get the API token from an environment variable
    api_token = os.getenv("HUGGINGFACE_API_TOKEN")
    if not api_token:
        print("Error: HUGGINGFACE_API_TOKEN environment variable not set")
        return "Error: HUGGINGFACE_API_TOKEN environment variable not set"

    # Headers for the API request. Note: never log these; they contain the token.
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }

    # Chat completions payload, the format this endpoint expects
    payload = {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.95,
    }

    try:
        print("Sending request to Hugging Face API...")
        print(f"API URL: {api_url}")
        print(f"Payload: {json.dumps(payload, indent=2)}")

        start_time = time.time()
        # Allow up to 5 minutes for the model to respond
        response = requests.post(api_url, headers=headers, json=payload, timeout=300)
        end_time = time.time()

        print(f"Received response from Hugging Face API in {end_time - start_time:.2f} seconds")
        print(f"Response status code: {response.status_code}")
        print(f"Response content: {response.text[:1000]}...")  # First 1000 chars

        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the chat completions response and extract the generated text
        result = response.json()
        print(f"Parsed response: {json.dumps(result, indent=2)[:1000]}...")  # First 1000 chars
        generated_text = result["choices"][0]["message"]["content"]
        return generated_text.strip()
    except requests.exceptions.Timeout:
        print("Request to Hugging Face API timed out after 5 minutes")
        return "The request timed out. The model is taking too long to respond. Please try again with a simpler query or fewer context documents."
    except requests.exceptions.ConnectionError:
        print("Could not connect to Hugging Face API")
        return "Could not connect to the Hugging Face API. Please check your internet connection."
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content: {e.response.text}")
        if e.response.status_code == 401:
            return "Authentication error. Please check your Hugging Face API token."
        elif e.response.status_code == 429:
            return "Rate limit exceeded. Please try again later."
        return f"HTTP error occurred: {e}"
    except Exception as e:
        print(f"Error querying LLM with context: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        return "Failed to generate an answer with the provided context."
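
# --- End-to-end usage sketch (illustrative; "document.pdf" and the query string are
# placeholder assumptions). Ties the pieces together: extract, chunk, embed, retrieve
# via the hypothetical answer, then generate an answer grounded in the retrieved context.
if __name__ == "__main__":
    text = extract_text_from_pdf("document.pdf")
    chunks = chunk_text(text)
    chunk_embeddings = embedding_function(chunks)

    query = "What are the key findings of the report?"
    context = retrieve_context_hyde(query, chunks, chunk_embeddings)
    answer = query_llm_with_context(query, context)
    print(answer)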