Spaces:

yalrashed
/

ScriptLLM

Sleeping

App Files Files Community

ScriptLLM / src /analysis /coverage_generator.py

yalrashed

Update src/analysis/coverage_generator.py

042441a verified about 1 year ago

raw

history blame

8.64 kB

	import logging
	import os
	from pathlib import Path
	from typing import Optional

	import google.generativeai as genai
	from tqdm import tqdm

	# Set up logging
	logging.basicConfig(level=logging.DEBUG,
	                    format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)


	class CoverageGenerator:
	    """Build a coverage document (synopsis plus token summary) for a screenplay.

	    The screenplay text is split into overlapping chunks, each chunk is
	    summarised by the configured Gemini model, and the per-chunk summaries are
	    merged into one synopsis that is written to ``coverage.txt`` next to the
	    screenplay file.

	    Token figures are estimates produced by :meth:`count_tokens`, not numbers
	    reported by the API.
	    """

	    def __init__(self, model_name: str = 'gemini-pro', chunk_size: int = 8000):
	        """Configure the Gemini client from the ``GOOGLE_API_KEY`` environment variable.

	        Args:
	            model_name: Name passed to ``genai.GenerativeModel``.
	            chunk_size: Target upper bound, in estimated tokens, for one chunk.

	        Raises:
	            ValueError: If ``GOOGLE_API_KEY`` is unset or empty.
	        """
	        api_key = os.getenv("GOOGLE_API_KEY")
	        if not api_key:
	            raise ValueError("GOOGLE_API_KEY not found")

	        genai.configure(api_key=api_key)
	        self.model = genai.GenerativeModel(model_name)

	        # Running totals of estimated tokens; reset by generate_coverage().
	        self.token_usage = self._empty_token_usage()

	        # Chunk budget in estimated tokens; the default is deliberately conservative.
	        self.chunk_size = chunk_size

	    @staticmethod
	    def _empty_token_usage() -> dict:
	        """Return a fresh, zeroed token-usage record."""
	        return {
	            'prompt_tokens': 0,
	            'completion_tokens': 0,
	            'total_tokens': 0,
	        }

	    def _record_usage(self, prompt_tokens: int, completion_tokens: int) -> None:
	        """Add one request's estimated token counts to the running totals."""
	        self.token_usage['prompt_tokens'] += prompt_tokens
	        self.token_usage['completion_tokens'] += completion_tokens
	        self.token_usage['total_tokens'] += prompt_tokens + completion_tokens

	    def count_tokens(self, text: str) -> int:
	        """Estimate the token count of *text* as 1.3 tokens per whitespace-separated word."""
	        words = text.split()
	        return int(len(words) * 1.3)

	    def chunk_screenplay(self, text: str, overlap_scenes: int = 2) -> list:
	        """Split *text* into chunks of roughly ``self.chunk_size`` estimated tokens.

	        Blocks separated by a blank line are treated as the unit of splitting
	        (no scene-heading detection is performed). When a chunk is closed, up
	        to ``overlap_scenes`` of its trailing blocks are repeated at the start
	        of the next chunk for context, but only as many as still fit in the
	        budget together with the incoming block, so a chunk never repeats the
	        whole of the previous one.

	        Args:
	            text: Full screenplay text.
	            overlap_scenes: Maximum number of trailing blocks to repeat in the
	                following chunk; values below 1 disable the overlap.

	        Returns:
	            The chunk strings, in order. Empty or whitespace-only input yields
	            an empty list. A single block larger than the budget cannot be
	            split and becomes an oversized chunk of its own.
	        """
	        logger.info("Chunking screenplay...")

	        # Pair every non-blank block with its estimate so it is computed once.
	        scenes = [(block, self.count_tokens(block))
	                  for block in text.split("\n\n") if block.strip()]
	        keep = max(0, overlap_scenes)

	        chunks = []
	        current = []  # (block_text, token_estimate) pairs of the open chunk
	        current_size = 0

	        for scene, scene_size in scenes:
	            if current and current_size + scene_size > self.chunk_size:
	                chunks.append("\n\n".join(block for block, _ in current))

	                # Slicing copies, so trimming below never touches `current`.
	                carried = current[-keep:] if keep else []
	                carried_size = sum(size for _, size in carried)
	                # Drop the oldest carried blocks until the new chunk fits.
	                while carried and carried_size + scene_size > self.chunk_size:
	                    carried_size -= carried.pop(0)[1]

	                current = carried + [(scene, scene_size)]
	                current_size = carried_size + scene_size
	            else:
	                current.append((scene, scene_size))
	                current_size += scene_size

	        # Flush whatever is left in the open chunk.
	        if current:
	            chunks.append("\n\n".join(block for block, _ in current))

	        logger.info(f"Split screenplay into {len(chunks)} chunks with context overlap")
	        return chunks

	    def read_screenplay(self, filepath: Path) -> Optional[str]:
	        """Read the cleaned screenplay file as UTF-8 text.

	        Returns:
	            The file contents, or ``None`` if the file cannot be opened or
	            decoded (the failure is logged).
	        """
	        logger.info(f"Reading screenplay from: {filepath}")
	        try:
	            with open(filepath, 'r', encoding='utf-8') as file:
	                text = file.read()
	        except (OSError, UnicodeError) as e:
	            logger.error(f"Error reading screenplay: {e}")
	            logger.error(f"Tried to read from: {filepath}")
	            return None

	        tokens = self.count_tokens(text)
	        logger.info(f"Successfully read screenplay. Length: {tokens} tokens (estimated)")
	        return text

	    def generate_synopsis(self, chunk: str, chunk_num: int = 1, total_chunks: int = 1) -> Optional[str]:
	        """Ask the model to summarise one chunk of the screenplay.

	        Args:
	            chunk: Text of the chunk.
	            chunk_num: 1-based position of the chunk, used in the prompt and logs.
	            total_chunks: Total number of chunks, used in the prompt and logs.

	        Returns:
	            The model's summary text, or ``None`` if the request or reading
	            the response fails (the failure is logged with a traceback).
	        """
	        prompt = f"""As an experienced script analyst, analyze this section ({chunk_num}/{total_chunks}) of the screenplay.

	Important: This section may overlap with others to maintain context. Focus on:
	- Key plot developments and their implications for the larger story
	- Character appearances and development
	- How this section connects to the ongoing narrative
	- Major themes or motifs that emerge

	Provide a summary that captures both the specific events and their significance to the larger narrative.

	Screenplay section:
	{chunk}"""

	        # The client's exception types are not visible here, so this stays a
	        # broad, logged boundary rather than a targeted handler.
	        try:
	            prompt_tokens = self.count_tokens(prompt)
	            logger.debug(f"Chunk {chunk_num} prompt length: {prompt_tokens} tokens")

	            with tqdm(total=1, desc=f"Processing chunk {chunk_num}/{total_chunks}") as pbar:
	                response = self.model.generate_content(prompt)
	                summary = response.text
	                pbar.update(1)

	            self._record_usage(prompt_tokens, self.count_tokens(summary))
	            return summary
	        except Exception as e:
	            logger.error(f"Error processing chunk {chunk_num}: {str(e)}")
	            logger.error("Full error details:", exc_info=True)
	            return None

	    def generate_final_synopsis(self, chunk_synopses: list) -> Optional[str]:
	        """Merge the per-chunk summaries into one synopsis of the whole screenplay.

	        The request's estimated tokens are added to ``self.token_usage`` so
	        the usage summary covers every model call, not only the chunk calls.

	        Returns:
	            The synthesized synopsis, or ``None`` if *chunk_synopses* is empty
	            or the request fails (the failure is logged).
	        """
	        if not chunk_synopses:
	            logger.error("Error generating final synopsis: no section summaries provided")
	            return None

	        combined_text = "\n\n".join(
	            f"Section {i+1}:\n{synopsis}"
	            for i, synopsis in enumerate(chunk_synopses)
	        )

	        prompt = f"""As an experienced script analyst, synthesize these section summaries into a comprehensive,
	narratively cohesive synopsis of the entire screenplay.

	You should have distinct sections on:
	1. The complete narrative arc from beginning to end
	2. Character development across the full story
	3. Major themes and how they evolve
	4. Key turning points and their impact
	5. The core conflict and its resolution

	Ensure the synopsis flows naturally and captures the full story without revealing the seams between sections.

	Section summaries:
	{combined_text}"""

	        try:
	            logger.info("Generating final synopsis")
	            prompt_tokens = self.count_tokens(prompt)
	            with tqdm(total=1, desc="Creating final synopsis") as pbar:
	                response = self.model.generate_content(prompt)
	                synopsis = response.text
	                pbar.update(1)

	            self._record_usage(prompt_tokens, self.count_tokens(synopsis))
	            return synopsis
	        except Exception as e:
	            logger.error(f"Error generating final synopsis: {str(e)}")
	            logger.error("Full error details:", exc_info=True)
	            return None

	    def generate_coverage(self, screenplay_path: Path) -> bool:
	        """Generate the full coverage document for one screenplay.

	        Reads the screenplay, summarises it chunk by chunk, synthesizes the
	        final synopsis and writes ``coverage.txt`` into the screenplay's
	        directory, followed by the estimated token usage.

	        Args:
	            screenplay_path: Location of the cleaned screenplay text file.

	        Returns:
	            ``True`` if the coverage file was written; ``False`` if any step
	            failed (each failure is logged where it occurs).
	        """
	        logger.info("Starting coverage generation")

	        # Accept plain string paths too; `.parent` is needed further down.
	        screenplay_path = Path(screenplay_path)
	        self.token_usage = self._empty_token_usage()

	        # Four progress steps: read, chunk, summarise chunks, write coverage.
	        with tqdm(total=4, desc="Generating coverage") as pbar:
	            # Read screenplay
	            screenplay_text = self.read_screenplay(screenplay_path)
	            if not screenplay_text:
	                return False
	            pbar.update(1)

	            # Split into chunks
	            chunks = self.chunk_screenplay(screenplay_text)
	            pbar.update(1)

	            # Process each chunk; one failed chunk aborts the whole run.
	            chunk_synopses = []
	            for i, chunk in enumerate(chunks, 1):
	                synopsis = self.generate_synopsis(chunk, i, len(chunks))
	                if not synopsis:
	                    logger.error(f"Failed to process chunk {i}")
	                    return False
	                chunk_synopses.append(synopsis)
	            pbar.update(1)

	            # Generate final synopsis
	            final_synopsis = self.generate_final_synopsis(chunk_synopses)
	            if not final_synopsis:
	                return False

	            # Save coverage
	            output_path = screenplay_path.parent / "coverage.txt"
	            usage = self.token_usage

	            try:
	                with open(output_path, 'w', encoding='utf-8') as f:
	                    f.write("SCREENPLAY COVERAGE\n\n")
	                    f.write("### SYNOPSIS ###\n\n")
	                    f.write(final_synopsis)

	                    # Add token usage summary
	                    f.write("\n\n### TOKEN USAGE SUMMARY ###\n")
	                    f.write(f"Prompt Tokens: {usage['prompt_tokens']}\n")
	                    f.write(f"Completion Tokens: {usage['completion_tokens']}\n")
	                    f.write(f"Total Tokens: {usage['total_tokens']}\n")
	            except (OSError, UnicodeError) as e:
	                logger.error(f"Error saving coverage: {str(e)}")
	                logger.error("Full error details:", exc_info=True)
	                return False

	            logger.info("\nFinal Token Usage Summary:")
	            logger.info(f"Prompt Tokens: {usage['prompt_tokens']}")
	            logger.info(f"Completion Tokens: {usage['completion_tokens']}")
	            logger.info(f"Total Tokens: {usage['total_tokens']}")

	            pbar.update(1)
	            return True