Spaces:

yalrashed
/

ScriptLLM

Sleeping

App Files Files Community

ScriptLLM / src /processing /gemini_processor.py

yalrashed

Update src/processing/gemini_processor.py

680c044 verified about 1 year ago

raw

history blame

4.26 kB

	import os
	import re
	from pathlib import Path
	from typing import List
	import google.generativeai as genai
	from PyPDF2 import PdfReader
	from tqdm import tqdm


	class GeminiProcessor:
	    """Clean up screenplay text extracted from a PDF, one scene at a time.

	    The pipeline implemented by :meth:`process_screenplay` is:

	    1. extract the text of every PDF page,
	    2. normalise it with :meth:`preprocess_text`,
	    3. cut it into scenes with :meth:`split_into_scenes`,
	    4. send each scene to the Gemini model via :meth:`clean_scene`,
	    5. write the cleaned scenes to the output file.
	    """

	    # Prefixes that mark a scene heading.  The alternation bars must be real
	    # regex ``|`` operators; an escaped ``\|`` would only match a literal
	    # pipe character and no heading would ever be recognised.
	    _HEADING = r'(?:INT\.|EXT\.|INT/EXT\.)'

	    # A line that consists of nothing but a heading prefix.
	    _BARE_HEADINGS = ('INT.', 'EXT.', 'INT/EXT.')

	    _HTML_TAG = re.compile(r'<[^>]+>')
	    # A heading prefix alone on its line.  The lookahead leaves the closing
	    # newline in place so the neighbouring lines are not glued together.
	    _STANDALONE_HEADING = re.compile(r'\n' + _HEADING + r'[ \t]*(?=\n)')
	    _LINE_NUMBER = re.compile(r'\d+\.$', re.MULTILINE)
	    # The parentheses are escaped so they match literally.
	    _CONTINUED = re.compile(r"[ \t]*\(CONT'D\)\d*")
	    # Horizontal whitespace only: a line that starts with punctuation
	    # (for example "...what?") must stay on its own line.
	    _SPACE_BEFORE_PUNCT = re.compile(r'[ \t]+([.,!?])')
	    _MULTI_SPACE = re.compile(r' +')
	    _MULTI_NEWLINE = re.compile(r'\n{3,}')
	    # Start of a scene: a heading prefix at the beginning of a line.
	    _SCENE_START = re.compile(r'^[ \t]*' + _HEADING, re.MULTILINE)

	    def __init__(self, model_name: str = 'gemini-pro'):
	        """Configure the Gemini client from the ``GOOGLE_API_KEY`` variable.

	        Args:
	            model_name: Name passed to ``genai.GenerativeModel``.  Defaults
	                to ``'gemini-pro'``, the value that used to be hard-coded.

	        Raises:
	            ValueError: If ``GOOGLE_API_KEY`` is unset or empty.
	        """
	        self.api_key = os.getenv("GOOGLE_API_KEY")
	        if not self.api_key:
	            raise ValueError("GOOGLE_API_KEY not found")

	        # Configure Gemini
	        genai.configure(api_key=self.api_key)
	        self.model = genai.GenerativeModel(model_name)

	    def preprocess_text(self, text: str) -> str:
	        """Normalise raw screenplay text.

	        Removes HTML-style tags, heading prefixes that stand alone on a
	        line, trailing ``<digits>.`` line numbers and ``(CONT'D)`` markers;
	        removes whitespace before ``. , ! ?``; collapses runs of spaces;
	        and finally drops blank lines and lines identical to the previously
	        kept line.

	        Args:
	            text: Screenplay text, lines separated by ``\\n``.

	        Returns:
	            The cleaned text, joined with ``\\n`` and without a trailing
	            newline.
	        """
	        # Remove HTML and script tags
	        text = self._HTML_TAG.sub('', text)

	        # Remove heading prefixes that sit alone on a line
	        text = self._STANDALONE_HEADING.sub('', text)

	        # Remove line numbers and (CONT'D)
	        text = self._LINE_NUMBER.sub('', text)
	        text = self._CONTINUED.sub('', text)

	        # Fix spacing around punctuation
	        text = self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

	        # Clean up multiple spaces and line breaks
	        text = self._MULTI_SPACE.sub(' ', text)
	        text = self._MULTI_NEWLINE.sub('\n\n', text)

	        # Remove blank lines, immediate repeats and leftover bare headings
	        kept = []
	        previous = None
	        for line in text.split('\n'):
	            stripped = line.strip()
	            if not stripped or line == previous:
	                continue
	            if stripped in self._BARE_HEADINGS:
	                continue
	            kept.append(line)
	            previous = line

	        return '\n'.join(kept)

	    def split_into_scenes(self, text: str) -> List[str]:
	        """Split screenplay text into scenes.

	        A scene starts at a line beginning with ``INT.``, ``EXT.`` or
	        ``INT/EXT.`` (optionally indented) and runs up to the next such
	        line or the end of the text.  Text before the first heading is not
	        part of any scene and is discarded.

	        Args:
	            text: Screenplay text, lines separated by ``\\n``.

	        Returns:
	            The non-empty scenes in document order, each stripped of
	            surrounding whitespace.  Empty when no heading is found.
	        """
	        starts = [match.start() for match in self._SCENE_START.finditer(text)]
	        # Slicing between consecutive heading offsets keeps the final line
	        # of the text even when it has no trailing newline.
	        bounds = starts + [len(text)]
	        chunks = (text[begin:end].strip() for begin, end in zip(bounds, bounds[1:]))
	        return [chunk for chunk in chunks if chunk]

	    def clean_scene(self, scene: str) -> str:
	        """Ask Gemini to fix spacing and indentation of a single scene.

	        The model output is accepted only when its whitespace-separated
	        word count differs from the input's by at most 3; otherwise, and on
	        any error raised while calling the model, the original scene is
	        returned unchanged (deliberate best-effort behaviour).

	        Args:
	            scene: Text of one scene.

	        Returns:
	            The stripped model output, or ``scene`` itself as a fallback.
	        """
	        if not scene.strip():
	            # Nothing to clean; avoid a pointless model call.
	            return scene

	        # Assembled line by line so that source-code indentation can never
	        # leak into the prompt.
	        prompt = (
	            "Fix ONLY spacing and indentation in this screenplay scene.\n"
	            "DO NOT modify any words or content. DO NOT add or remove lines.\n"
	            "Keep original capitalization and formatting:\n"
	            "\n"
	            f"{scene}"
	        )

	        try:
	            response = self.model.generate_content(prompt)
	            candidate = response.text
	            # Basic validation: reject output that added or lost words
	            if candidate and abs(len(scene.split()) - len(candidate.split())) <= 3:
	                return candidate.strip()
	            return scene

	        except Exception as e:
	            print(f"Error cleaning scene: {str(e)}")
	            return scene

	    def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
	        """Run the whole clean-up pipeline on one PDF.

	        Args:
	            pdf_path: Path of the screenplay PDF to read.
	            output_path: Path of the UTF-8 text file to write; missing
	                parent directories are created.

	        Returns:
	            ``True`` when the output file was written, ``False`` when any
	            step raised (the error is printed, not propagated).
	        """
	        try:
	            # Read PDF
	            with open(pdf_path, 'rb') as file:
	                pdf = PdfReader(file)
	                # "or ''" guards against a page whose extraction yields
	                # nothing, which would otherwise break the join.
	                text = '\n'.join(page.extract_text() or '' for page in pdf.pages)

	            # Initial preprocessing
	            text = self.preprocess_text(text)

	            # Split into scenes
	            scenes = self.split_into_scenes(text)
	            print(f"Found {len(scenes)} scenes")

	            # Process each scene
	            cleaned_scenes = []
	            for scene in tqdm(scenes, desc="Processing scenes"):
	                cleaned = self.clean_scene(scene)
	                if cleaned:
	                    cleaned = self.preprocess_text(cleaned)
	                    cleaned_scenes.append(cleaned)

	            # Save result
	            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
	            with open(output_path, 'w', encoding='utf-8') as f:
	                f.write('\n\n'.join(cleaned_scenes))

	            return True

	        except Exception as e:
	            print(f"Error processing screenplay: {str(e)}")
	            return False