Spaces:
Sleeping
Sleeping
| from sentence_transformers import SentenceTransformer | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from pypdf import PdfReader | |
| import requests | |
| import json | |
| import os | |
| import time | |
def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_path: Path (or file-like object) accepted by ``pypdf.PdfReader``.

    Returns:
        All page texts joined by newlines, with surrounding whitespace stripped.
    """
    reader = PdfReader(pdf_path)
    # pypdf's extract_text() may return None for pages with no extractable
    # text (e.g. scanned images); coerce to "" so the join never raises.
    pages = (page.extract_text() or "" for page in reader.pages)
    return "\n".join(pages).strip()
def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into overlapping chunks for embedding and retrieval.

    Args:
        text: Raw document text to split.
        chunk_size: Target maximum size of each chunk.
        chunk_overlap: Characters shared between consecutive chunks so
            context is preserved across chunk boundaries.

    Returns:
        A list of text chunks.
    """
    # Prefer paragraph breaks, then line breaks, then spaces; split
    # mid-word only as a last resort.
    preferred_separators = ["\n\n", "\n", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=preferred_separators,
    )
    return text_splitter.split_text(text)
# Sentence-embedding model shared by this module; loaded once at import time.
# NOTE(review): model download/load happens on import — confirm that is acceptable
# for the app's startup path.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def embedding_function(texts):
    """Embed *texts* with the module-level model, returning plain Python lists."""
    vectors = embedding_model.encode(texts, convert_to_numpy=True)
    return vectors.tolist()
def generate_hypothetical_answer(query):
    """Generate a hypothetical answer for *query* via the Hugging Face chat API.

    HyDE-style retrieval step: the hypothetical answer can be embedded and used
    to search the document store instead of (or alongside) the raw query.

    Args:
        query: The user's search query.

    Returns:
        The generated answer text, or a human-readable error-message string if
        the request fails (callers always receive a string).
    """
    import os

    # Fail fast — and before importing the HTTP stack — if no token is set.
    api_token = os.getenv("HUGGINGFACE_API_TOKEN")
    if not api_token:
        print("Error: HUGGINGFACE_API_TOKEN environment variable not set")
        return "Error: HUGGINGFACE_API_TOKEN environment variable not set"

    import requests
    import json
    import time

    # OpenAI-compatible chat-completions endpoint on the Hugging Face router.
    api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }

    prompt = f"""
Given the following query, generate a hypothetical answer that might be found in a document:
Query: {query}
Hypothetical answer:
"""

    # The /v1/chat/completions route speaks the OpenAI-compatible chat format
    # ("messages" / "max_tokens"). The previous "inputs"/"parameters" payload is
    # the text-generation format and is not what this endpoint expects.
    payload = {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 256,
        "temperature": 0.7,
        "top_p": 0.95,
    }

    try:
        print("Sending request to Hugging Face API with vLLM for hypothetical answer...")
        print(f"API URL: {api_url}")
        # SECURITY: never log the real Authorization header — it holds the bearer token.
        print("Headers: {'Authorization': 'Bearer ***redacted***', 'Content-Type': 'application/json'}")
        print(f"Payload: {json.dumps(payload, indent=2)}")
        start_time = time.time()
        # Generation can be slow; allow up to 5 minutes.
        response = requests.post(api_url, headers=headers, json=payload, timeout=300)
        print(f"Received hypothetical answer from Hugging Face API in {time.time() - start_time:.2f} seconds")
        print(f"Response status code: {response.status_code}")
        print(f"Response headers: {response.headers}")
        try:
            print(f"Response content: {response.text[:1000]}...")  # first 1000 chars
        except Exception:  # best-effort debug output only
            print("Could not print response content")
        response.raise_for_status()  # Raise an exception for HTTP errors

        result = response.json()
        print(f"Parsed response: {json.dumps(result, indent=2)[:1000]}...")

        # Chat-completions responses carry the text under choices[0].message.content;
        # keep the legacy "generated_text" shapes as fallbacks for other backends.
        if isinstance(result, dict) and result.get("choices"):
            generated_text = result["choices"][0].get("message", {}).get("content", "")
        elif isinstance(result, list) and result:
            generated_text = result[0].get("generated_text", "")
        elif isinstance(result, dict):
            generated_text = result.get("generated_text", "")
        else:
            generated_text = ""
        return generated_text.strip()
    except requests.exceptions.Timeout:
        print("Request to Hugging Face API timed out after 5 minutes")
        return "The request timed out. The model is taking too long to respond. Please try again with a simpler query."
    except requests.exceptions.ConnectionError:
        print("Could not connect to Hugging Face API")
        return "Could not connect to the Hugging Face API. Please check your internet connection."
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response headers: {e.response.headers}")
        try:
            print(f"Response content: {e.response.text}")
        except Exception:  # best-effort debug output only
            print("Could not print response content")
        if e.response.status_code == 401:
            return "Authentication error. Please check your Hugging Face API token."
        elif e.response.status_code == 429:
            return "Rate limit exceeded. Please try again later."
        return f"HTTP error occurred: {e}"
    except Exception as e:
        print(f"Error generating hypothetical answer: {e}")
        import traceback
        print(f"Traceback: {traceback.format_exc()}")
        return "Failed to generate a hypothetical answer."
def query_llm_with_context(query, context, top_n=3):
    """Answer *query* via the Hugging Face chat API, grounded in retrieved context.

    Args:
        query: The user's question.
        context: A ``(documents, similarity_scores)`` tuple from the retrieval
            step; only ``documents`` is used here.
        top_n: Number of top-ranked documents to include in the prompt.

    Returns:
        The generated answer text, or a human-readable error-message string if
        the request fails (callers always receive a string).
    """
    import os

    # Fail fast — and before importing the HTTP stack — if no token is set.
    api_token = os.getenv("HUGGINGFACE_API_TOKEN")
    if not api_token:
        print("Error: HUGGINGFACE_API_TOKEN environment variable not set")
        return "Error: HUGGINGFACE_API_TOKEN environment variable not set"

    import requests
    import json
    import time

    documents, similarity_scores = context  # scores unused; kept for interface parity
    top_docs = documents[:top_n]
    context_text = "\n\n===Document Boundary===\n\n".join(top_docs)

    prompt = f"""
Context information is below.
---------------------
{context_text}
---------------------
Given the context information and not prior knowledge, answer the following query:
Query: {query}
"""

    # OpenAI-compatible chat-completions endpoint on the Hugging Face router.
    api_url = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-2-7b-chat-hf/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json",
    }

    # The /v1/chat/completions route speaks the OpenAI-compatible chat format
    # ("messages" / "max_tokens"). The previous "inputs"/"parameters" payload is
    # the text-generation format and is not what this endpoint expects.
    payload = {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.95,
    }

    try:
        print("Sending request to Hugging Face API with vLLM...")
        print(f"API URL: {api_url}")
        # SECURITY: never log the real Authorization header — it holds the bearer token.
        print("Headers: {'Authorization': 'Bearer ***redacted***', 'Content-Type': 'application/json'}")
        print(f"Payload: {json.dumps(payload, indent=2)}")
        start_time = time.time()
        # Generation over long contexts can be slow; allow up to 5 minutes.
        response = requests.post(api_url, headers=headers, json=payload, timeout=300)
        print(f"Received response from Hugging Face API in {time.time() - start_time:.2f} seconds")
        print(f"Response status code: {response.status_code}")
        print(f"Response headers: {response.headers}")
        try:
            print(f"Response content: {response.text[:1000]}...")  # first 1000 chars
        except Exception:  # best-effort debug output only
            print("Could not print response content")
        response.raise_for_status()  # Raise an exception for HTTP errors

        result = response.json()
        print(f"Parsed response: {json.dumps(result, indent=2)[:1000]}...")

        # Chat-completions responses carry the text under choices[0].message.content;
        # keep the legacy "generated_text" shapes as fallbacks for other backends.
        if isinstance(result, dict) and result.get("choices"):
            generated_text = result["choices"][0].get("message", {}).get("content", "")
        elif isinstance(result, list) and result:
            generated_text = result[0].get("generated_text", "")
        elif isinstance(result, dict):
            generated_text = result.get("generated_text", "")
        else:
            generated_text = ""
        return generated_text.strip()
    except requests.exceptions.Timeout:
        print("Request to Hugging Face API timed out after 5 minutes")
        return "The request timed out. The model is taking too long to respond. Please try again with a simpler query or fewer context documents."
    except requests.exceptions.ConnectionError:
        print("Could not connect to Hugging Face API")
        return "Could not connect to the Hugging Face API. Please check your internet connection."
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response headers: {e.response.headers}")
        try:
            print(f"Response content: {e.response.text}")
        except Exception:  # best-effort debug output only
            print("Could not print response content")
        if e.response.status_code == 401:
            return "Authentication error. Please check your Hugging Face API token."
        elif e.response.status_code == 429:
            return "Rate limit exceeded. Please try again later."
        return f"HTTP error occurred: {e}"
    except Exception as e:
        print(f"Error querying LLM with context: {e}")
        import traceback
        print(f"Traceback: {traceback.format_exc()}")
        return "Failed to generate an answer with the provided context."