# ScriptLLM / src/processing/gemini_processor.py
# Hugging Face file-view header (page residue, not part of the module):
#   yalrashed's picture
#   Update src/processing/gemini_processor.py
#   680c044 verified
#   raw
#   history blame
#   4.26 kB
import os
import re
from pathlib import Path
from typing import List
import google.generativeai as genai
from PyPDF2 import PdfReader
from tqdm import tqdm
class GeminiProcessor:
    """Clean up screenplay text extracted from a PDF, scene by scene.

    The pipeline is: extract text from the PDF, run regex-based cleanup
    (`preprocess_text`), split the result into scenes (`split_into_scenes`),
    send each scene to a Gemini model for spacing fixes (`clean_scene`),
    and write the joined result to disk (`process_screenplay`).
    """

    # Regex fragment matching the token that opens a scene heading.
    _HEADING = r'(?:INT\.|EXT\.|INT/EXT\.)'
    # Lines consisting solely of one of these tokens are treated as noise.
    _HEADING_TOKENS = ('INT.', 'EXT.', 'INT/EXT.')

    # A heading token alone on its line (anchored so that removing it never
    # touches the line break that precedes it).
    _STANDALONE_HEADING_RE = re.compile(r'^' + _HEADING + r'\s*\n', re.MULTILINE)

    # One scene: a heading line followed by every line (blank or not) up to
    # the next line that starts with a heading token, or the end of the text.
    # The leading \b stops words such as "POINT." or "TEXT." from being
    # mistaken for a heading.
    _SCENE_RE = re.compile(
        r'\b' + _HEADING + r'[^\n]+'
        r'(?:\n(?!' + _HEADING + r')[^\n]*)*'
    )

    def __init__(self):
        """Configure the Gemini client from the GOOGLE_API_KEY environment variable.

        Raises:
            ValueError: If GOOGLE_API_KEY is unset or empty.
        """
        self.api_key = os.getenv("GOOGLE_API_KEY")
        if not self.api_key:
            raise ValueError("GOOGLE_API_KEY not found")

        # Configure Gemini
        genai.configure(api_key=self.api_key)
        # NOTE(review): confirm 'gemini-pro' is still an available model name.
        self.model = genai.GenerativeModel('gemini-pro')

    def preprocess_text(self, text: str) -> str:
        """Apply regex-based cleanup to screenplay text.

        Removes markup tags, standalone heading tokens, trailing "<digits>."
        markers, and "(CONT'D)" annotations; tightens spacing before
        punctuation; collapses runs of spaces; and finally drops blank lines
        and lines identical to the previously kept line.

        Args:
            text: Raw or partially cleaned screenplay text.

        Returns:
            The cleaned text, with kept lines joined by single newlines.
        """
        # Remove HTML and script tags
        text = re.sub(r'<[^>]+>', '', text)

        # Remove heading tokens that sit alone on a line. Only the token's own
        # line is removed, so the neighbouring lines stay separate.
        text = self._STANDALONE_HEADING_RE.sub('', text)

        # Remove line numbers and (CONT'D)
        text = re.sub(r'\d+\.$', '', text, flags=re.MULTILINE)
        text = re.sub(r"\(CONT'D\)\d*", '', text)

        # Fix spacing before punctuation. Newlines are excluded on purpose:
        # a line that starts with "..." must not be merged into the line above.
        text = re.sub(r'[^\S\n]+([.,!?])', r'\1', text)

        # Collapse runs of spaces. Runs of blank lines need no separate
        # handling because every blank line is dropped below.
        text = re.sub(r' +', ' ', text)

        # Drop blank lines, leftover standalone heading tokens, and
        # consecutive repeats of the same line.
        cleaned_lines = []
        prev_line = None
        for line in text.split('\n'):
            stripped = line.strip()
            if not stripped or line == prev_line:
                continue
            if stripped in self._HEADING_TOKENS:
                continue
            cleaned_lines.append(line)
            prev_line = line

        return '\n'.join(cleaned_lines)

    def split_into_scenes(self, text: str) -> List[str]:
        """Split screenplay text into scenes, each starting with its heading.

        A scene starts at an INT./EXT./INT/EXT. heading and runs until the
        next line that begins with such a heading, or the end of the text.
        Text before the first heading is not part of any scene.

        Args:
            text: Screenplay text, typically the output of `preprocess_text`.

        Returns:
            The scenes in order of appearance, stripped of surrounding
            whitespace. Empty when no heading is found.
        """
        valid_scenes = []
        for scene in self._SCENE_RE.findall(text):
            scene = scene.strip()
            if scene:
                valid_scenes.append(scene)
        return valid_scenes

    def clean_scene(self, scene: str) -> str:
        """Ask Gemini to fix spacing and indentation in a single scene.

        The model output is accepted only if its word count is within three
        words of the input; otherwise, or on any error, the original scene is
        returned unchanged.

        Args:
            scene: Text of one scene.

        Returns:
            The stripped model output when it passes validation, else `scene`.
        """
        # Nothing to clean; avoid a pointless API call.
        if not scene.strip():
            return scene

        prompt = f"""Fix ONLY spacing and indentation in this screenplay scene.
DO NOT modify any words or content. DO NOT add or remove lines.
Keep original capitalization and formatting:
{scene}"""

        try:
            response = self.model.generate_content(prompt)
            if response.text:
                cleaned = response.text
                # Basic validation: reject output whose word count drifted,
                # which suggests the model changed the content.
                if abs(len(scene.split()) - len(cleaned.split())) <= 3:
                    return cleaned.strip()
            return scene
        except Exception as e:
            # Best effort: any failure in the API call or in reading the
            # response falls back to the unmodified scene.
            print(f"Error cleaning scene: {str(e)}")
            return scene

    def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
        """Run the full cleanup pipeline on a PDF and write the result.

        Args:
            pdf_path: Path of the screenplay PDF to read.
            output_path: Path of the text file to write; missing parent
                directories are created.

        Returns:
            True on success, False if any step raised (the error is printed).
        """
        try:
            # Read PDF. Extraction happens while the file is still open.
            with open(pdf_path, 'rb') as file:
                pdf = PdfReader(file)
                # A page with no extractable text contributes an empty string
                # instead of breaking the join.
                text = '\n'.join(page.extract_text() or '' for page in pdf.pages)

            # Initial preprocessing
            text = self.preprocess_text(text)

            # Split into scenes
            scenes = self.split_into_scenes(text)
            print(f"Found {len(scenes)} scenes")

            # Process each scene
            cleaned_scenes = []
            for scene in tqdm(scenes, desc="Processing scenes"):
                cleaned = self.clean_scene(scene)
                if cleaned:
                    # Re-run the regex cleanup on the model output.
                    cleaned = self.preprocess_text(cleaned)
                    cleaned_scenes.append(cleaned)

            # Save result
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write('\n\n'.join(cleaned_scenes))
            return True
        except Exception as e:
            print(f"Error processing screenplay: {str(e)}")
            return False