Spaces:

kouki321
/

exemple3

Runtime error

App Files Files Community

exemple3 / app.py

kouki321

Update app.py

b5dd82d verified 11 months ago

raw

history blame

12.9 kB

	import os
	import torch
	from fastapi import FastAPI, File, UploadFile, HTTPException, Body
	from fastapi.responses import JSONResponse,RedirectResponse
	from transformers import AutoTokenizer, AutoModelForCausalLM
	from transformers.cache_utils import DynamicCache , StaticCache
	from pydantic import BaseModel
	from typing import Optional
	import uvicorn
	import tempfile
	from time import time
	from pyngrok import ngrok

	os.environ["HF_HOME"] = "/app/hf_cache"
	#os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"

	# Add necessary serialization safety
	torch.serialization.add_safe_globals([DynamicCache])
	torch.serialization.add_safe_globals([set])
	#These lines allow PyTorch to serialize and deserialize these objects without raising errors,
	# #ensuring compatibility and functionality during cache saving/loading.

	# Minimal generate function for token-by-token generation
	def generate(model,
	input_ids,
	past_key_values,
	max_new_tokens=50):
	"""
	This function performs token-by-token text generation using a pre-trained language model.
	Purpose: To generate new text based on input tokens, without loading the full context repeatedly
	Process: It takes a model, input IDs, and cached key-values, then generates new tokens one by one up to the specified maximum
	Performance: Uses the cached key-values for efficiency and returns only the newly generated tokens
	"""
	device = model.model.embed_tokens.weight.device
	origin_len = input_ids.shape[-1]#Stores the length of the input sequence (number of tokens) before text generation begins./return only the newly
	input_ids = input_ids.to(device)#same device as the model.
	output_ids = input_ids.clone()#will be updated during the generation process to include newly generated tokens.
	next_token = input_ids#the token that will process in the next iteration.
	with torch.no_grad():
	for _ in range(max_new_tokens):
	out = model(
	input_ids=next_token,
	past_key_values=past_key_values,
	use_cache=True
	)
	logits = out.logits[:, -1, :]#Extracts the logits for the last token
	token = torch.argmax(logits, dim=-1, keepdim=True)#highest predicted probability as the next token.
	output_ids = torch.cat([output_ids, token], dim=-1)#add the newly generated token
	past_key_values = out.past_key_values
	next_token = token.to(device)
	if model.config.eos_token_id is not None and token.item() == model.config.eos_token_id:
	break
	return output_ids[:, origin_len:] # Return just the newly generated part

	def get_kv_cache(model, tokenizer, prompt):
	"""
	This function creates a key-value cache for a given prompt.
	Purpose: To pre-compute and store the model's internal representations (key-value states) for a prompt
	Process: Encodes the prompt, runs it through the model, and captures the resulting cache
	Returns: The cache object and the original prompt length for future reference
	"""
	# Encode prompt
	device = model.model.embed_tokens.weight.device
	input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
	cache = DynamicCache() # it grows as text is generated

	# Run the model to populate the KV cache:
	with torch.no_grad():
	_ = model(
	input_ids=input_ids,
	past_key_values=cache,
	use_cache=True
	)
	return cache, input_ids.shape[-1]

	def clean_up(cache, origin_len):
	# Make a deep copy of the cache first
	new_cache = DynamicCache()
	for i in range(len(cache.key_cache)):
	new_cache.key_cache.append(cache.key_cache[i].clone())
	new_cache.value_cache.append(cache.value_cache[i].clone())

	# Remove any tokens appended to the original knowledge
	for i in range(len(new_cache.key_cache)):
	new_cache.key_cache[i] = new_cache.key_cache[i][:, :, :origin_len, :]
	new_cache.value_cache[i] = new_cache.value_cache[i][:, :, :origin_len, :]
	return new_cache
	#os.environ["TRANSFORMERS_OFFLINE"] = "1"
	#os.environ["HF_HUB_OFFLINE"] = "1"

	# Path to your local model

	# Initialize model and tokenizer
	def load_model_and_tokenizer():
	model_name = "Locutusque/TinyMistral-248M"
	#"tiiuae/falcon-rw-1b"
	#"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
	#"facebook/opt-125m"
	# Load tokenizer and model from disk (without trust_remote_code)
	tokenizer = AutoTokenizer.from_pretrained(model_name )
	if torch.cuda.is_available():
	# Load model on GPU if CUDA is available
	model = AutoModelForCausalLM.from_pretrained(
	model_name,
	torch_dtype=torch.float16,
	device_map="auto" # Automatically map model layers to GPU

	)
	else:
	# Load model on CPU if no GPU is available
	model = AutoModelForCausalLM.from_pretrained(
	model_name,
	torch_dtype=torch.float32, # Use float32 for compatibility with CPU
	low_cpu_mem_usage=True # Reduce memory usage on CPU

	)
	return model, tokenizer

	# Create FastAPI app
	app = FastAPI(title="DeepSeek QA with KV Cache API")


	# Initialize model and tokenizer at startup
	model, tokenizer = load_model_and_tokenizer()
	# Global variables to store the cache, origin length, and model/tokenizer
	cache_store = {}
	class QueryRequest(BaseModel):
	query: str
	max_new_tokens: Optional[int] = 150
	def clean_response(response_text):
	"""
	Clean up model response by removing redundant tags, repetitions, and formatting issues.
	"""
	# First, try to extract just the answer content between tags if they exist
	import re

	# Try to extract content between assistant tags if present
	assistant_pattern = re.compile(r'<\\|assistant\\|>\s(.?)(?:<\/\\|assistant\\|>\|<\\|user\\|>\|<\\|system\\|>)', re.DOTALL)
	matches = assistant_pattern.findall(response_text)

	if matches:
	# Return the first meaningful assistant response
	for match in matches:
	cleaned = match.strip()
	if cleaned and not cleaned.startswith("<\|") and len(cleaned) > 5:
	return cleaned

	# If no proper match found, try more aggressive cleaning
	# Remove all tag markers completely
	cleaned = re.sub(r'<\\|.*?\\|>', '', response_text)
	cleaned = re.sub(r'<\/\\|.*?\\|>', '', cleaned)

	# Remove duplicate lines (common in generated responses)
	lines = cleaned.strip().split('\n')
	unique_lines = []
	for line in lines:
	line = line.strip()
	if line and line not in unique_lines:
	unique_lines.append(line)

	result = '\n'.join(unique_lines)

	# Final cleanup - remove any trailing system/user markers
	result = re.sub(r'<\/?\\|.?\\|>\s$', '', result)

	return result.strip()
	@app.post("/upload-document_to_create_KV_cache")
	async def upload_document(file: UploadFile = File(...)):
	"""Upload a document and create KV cache for it"""
	t1 = time()

	# Save the uploaded file temporarily
	with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
	temp_file_path = temp_file.name
	content = await file.read()
	temp_file.write(content)

	try:
	# Read the document
	with open(temp_file_path, "r", encoding="utf-8") as f:
	doc_text = f.read()

	# Create system prompt with document context
	system_prompt = f"""
	<\|system\|>
	Answer concisely and precisely, You are an assistant who provides concise factual answers.
	<\|user\|>
	Context:
	{doc_text}
	Question:
	""".strip()

	# Create KV cache
	cache, origin_len = get_kv_cache(model, tokenizer, system_prompt)

	# Generate a unique ID for this document/cache
	cache_id = f"cache_{int(time())}"

	# Store the cache and origin_len
	cache_store[cache_id] = {
	"cache": cache,
	"origin_len": origin_len,
	"doc_preview": doc_text[:500] + "..." if len(doc_text) > 500 else doc_text
	}

	# Clean up the temporary file
	os.unlink(temp_file_path)

	t2 = time()

	return {
	"cache_id": cache_id,
	"message": "Document uploaded and cache created successfully",
	"doc_preview": cache_store[cache_id]["doc_preview"],
	"time_taken": f"{t2 - t1:.4f} seconds"
	}

	except Exception as e:
	# Clean up the temporary file in case of error
	if os.path.exists(temp_file_path):
	os.unlink(temp_file_path)
	raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")

	@app.post("/generate_answer_from_cache/{cache_id}")
	async def generate_answer(cache_id: str, request: QueryRequest):
	"""Generate an answer to a question based on the uploaded document"""
	t1 = time()

	# Check if the document/cache exists
	if cache_id not in cache_store:
	raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")

	try:
	# Get a clean copy of the cache
	current_cache = clean_up(
	cache_store[cache_id]["cache"],
	cache_store[cache_id]["origin_len"]
	)

	# Prepare input with just the query
	full_prompt = f"""
	<\|user\|>
	Question: {request.query}
	<\|assistant\|>
	""".strip()

	input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids

	# Generate response
	output_ids = generate(model, input_ids, current_cache, max_new_tokens=request.max_new_tokens)
	response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
	rep = clean_response(response)
	t2 = time()

	return {
	"query": request.query,
	"answer": rep,
	"time_taken": f"{t2 - t1:.4f} seconds"
	}

	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Error generating answer: {str(e)}")

	@app.post("/save_cache/{cache_id}")
	async def save_cache(cache_id: str):
	"""Save the cache for a document"""
	if cache_id not in cache_store:
	raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")

	try:
	# Clean up the cache and save it
	cleaned_cache = clean_up(
	cache_store[cache_id]["cache"],
	cache_store[cache_id]["origin_len"]
	)

	cache_path = f"{cache_id}_cache.pth"
	torch.save(cleaned_cache, cache_path)

	return {
	"message": f"Cache saved successfully as {cache_path}",
	"cache_path": cache_path
	}

	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Error saving cache: {str(e)}")

	@app.post("/load_cache")
	async def load_cache(file: UploadFile = File(...)):
	"""Load a previously saved cache"""
	with tempfile.NamedTemporaryFile(delete=False, suffix=".pth") as temp_file:
	temp_file_path = temp_file.name
	content = await file.read()
	temp_file.write(content)

	try:
	# Load the cache
	loaded_cache = torch.load(temp_file_path)

	# Generate a unique ID for this cache
	cache_id = f"loaded_cache_{int(time())}"

	# Store the cache (we don't have the original document text)
	cache_store[cache_id] = {
	"cache": loaded_cache,
	"origin_len": loaded_cache.key_cache[0].shape[-2],
	"doc_preview": "Loaded from cache file"
	}

	# Clean up the temporary file
	os.unlink(temp_file_path)

	return {
	"cache_id": cache_id,
	"message": "Cache loaded successfully"
	}

	except Exception as e:
	# Clean up the temporary file in case of error
	if os.path.exists(temp_file_path):
	os.unlink(temp_file_path)
	raise HTTPException(status_code=500, detail=f"Error loading cache: {str(e)}")

	@app.get("/list_of_caches")
	async def list_documents():
	"""List all uploaded documents/caches"""
	documents = {}
	for cache_id in cache_store:
	documents[cache_id] = {
	"doc_preview": cache_store[cache_id]["doc_preview"],
	"origin_len": cache_store[cache_id]["origin_len"]
	}

	return {"documents": documents}

	@app.get("/", include_in_schema=False)
	async def root():
	return RedirectResponse(url="/docs")

	if __name__ == "__main__":
	uvicorn.run(app, host="0.0.0.0", port=7860)