File size: 4,264 Bytes
680c044 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import os
import re
from pathlib import Path
from typing import List
import google.generativeai as genai
from PyPDF2 import PdfReader
from tqdm import tqdm
class GeminiProcessor:
    """Clean up screenplay text extracted from a PDF.

    The pipeline implemented by :meth:`process_screenplay` is: extract text
    from every PDF page, run regex-based cleanup, split the text into scenes
    on ``INT.`` / ``EXT.`` / ``INT/EXT.`` headers, send each scene to a Gemini
    model for spacing fixes, and write the joined result to a text file.
    """

    # Heading prefixes that carry no information when alone on a line.
    _HEADING_PREFIXES = ('INT.', 'EXT.', 'INT/EXT.')

    # A scene is a header line followed by every subsequent non-empty line
    # that does not itself start with a header prefix. Every line, including
    # the last one, must be newline-terminated for this pattern to consume it.
    # NOTE(review): the header part is not anchored to the start of a line,
    # so the first match may begin mid-line (e.g. after a scene number).
    _SCENE_PATTERN = re.compile(
        r'((?:INT\.|EXT\.|INT\/EXT\.)[^\n]+\n'
        r'(?:(?!(?:INT\.|EXT\.|INT\/EXT\.))[^\n]+\n)*)'
    )

    def __init__(self, model_name: str = 'gemini-pro'):
        """Configure the Gemini client from the ``GOOGLE_API_KEY`` variable.

        Args:
            model_name: Model identifier passed to ``genai.GenerativeModel``.

        Raises:
            ValueError: If ``GOOGLE_API_KEY`` is unset or empty.
        """
        self.api_key = os.getenv("GOOGLE_API_KEY")
        if not self.api_key:
            raise ValueError("GOOGLE_API_KEY not found")
        # Configure Gemini
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel(model_name)

    def preprocess_text(self, text: str) -> str:
        """Apply regex-based cleanup to screenplay text.

        Removes markup tags, heading prefixes stranded on their own line,
        lines consisting only of a number followed by a period, ``(CONT'D)``
        markers, whitespace before punctuation, runs of spaces, blank lines
        and consecutive duplicate lines.

        Args:
            text: Raw or partially cleaned screenplay text.

        Returns:
            The cleaned text, lines joined by a newline, with no trailing
            newline.
        """
        # Remove HTML and script tags
        text = re.sub(r'<[^>]+>', '', text)
        # Drop heading prefixes that sit alone on a line. The replacement
        # keeps one newline so the neighbouring lines are not glued together.
        text = re.sub(r'\n(INT\.|EXT\.|INT\/EXT\.)\s*\n', '\n', text)
        # Remove lines that hold nothing but a number and a period. The
        # pattern covers the whole line so that a sentence ending in a number
        # ("Room 12.") is left intact.
        text = re.sub(r'^[^\S\n]*\d+\.[^\S\n]*$', '', text, flags=re.MULTILINE)
        text = re.sub(r'\(CONT\'D\)\d*', '', text)
        # Fix spacing before punctuation. Newlines are excluded so that a
        # line starting with punctuation (e.g. "...") stays on its own line.
        text = re.sub(r'[^\S\n]+([.,!?])', r'\1', text)
        # Clean up multiple spaces and line breaks
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Drop blank lines, consecutive duplicates and leftover bare prefixes.
        cleaned_lines: List[str] = []
        prev_line = None
        for line in text.split('\n'):
            stripped = line.strip()
            if not stripped or line == prev_line:
                continue
            if stripped in self._HEADING_PREFIXES:
                continue
            cleaned_lines.append(line)
            prev_line = line
        return '\n'.join(cleaned_lines)

    def split_into_scenes(self, text: str) -> List[str]:
        """Split screenplay text into scenes, each starting with its header.

        Args:
            text: Screenplay text, typically the output of
                :meth:`preprocess_text`.

        Returns:
            The non-empty scenes in document order, each stripped of
            surrounding whitespace. Text before the first header is not
            included in any scene.
        """
        # The pattern only consumes newline-terminated lines; without this
        # the last line of the final scene would be silently dropped.
        if not text.endswith('\n'):
            text += '\n'
        stripped_scenes = (match.strip() for match in self._SCENE_PATTERN.findall(text))
        return [scene for scene in stripped_scenes if scene]

    def clean_scene(self, scene: str) -> str:
        """Ask Gemini to fix spacing in one scene, falling back to the input.

        The model output is accepted only when its whitespace-separated word
        count differs from the input's by at most 3; this guards against the
        model rewriting content.

        Args:
            scene: Text of a single scene.

        Returns:
            The stripped model output when it passes validation; otherwise
            the original ``scene`` (also on any exception, which is printed).
        """
        prompt = f"""Fix ONLY spacing and indentation in this screenplay scene.
DO NOT modify any words or content. DO NOT add or remove lines.
Keep original capitalization and formatting:
{scene}"""
        try:
            response = self.model.generate_content(prompt)
            cleaned = response.text
            # Basic validation
            if cleaned and abs(len(scene.split()) - len(cleaned.split())) <= 3:
                return cleaned.strip()
            return scene
        except Exception as e:
            # Best effort: one failed scene must not abort the whole run.
            print(f"Error cleaning scene: {str(e)}")
            return scene

    def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
        """Run the full cleanup pipeline on a PDF and write the result.

        Args:
            pdf_path: Path of the screenplay PDF to read.
            output_path: Path of the UTF-8 text file to write; missing parent
                directories are created.

        Returns:
            True on success, False if any step raised (the error is printed).
        """
        try:
            # Read PDF
            with open(pdf_path, 'rb') as file:
                pdf = PdfReader(file)
                # "or ''" keeps the join safe if a page yields no text.
                text = '\n'.join(
                    (page.extract_text() or '') for page in pdf.pages
                )

            # Initial preprocessing
            text = self.preprocess_text(text)

            # Split into scenes
            scenes = self.split_into_scenes(text)
            print(f"Found {len(scenes)} scenes")

            # Process each scene
            cleaned_scenes = []
            for scene in tqdm(scenes, desc="Processing scenes"):
                cleaned = self.clean_scene(scene)
                if cleaned:
                    cleaned_scenes.append(self.preprocess_text(cleaned))

            # Save result
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write('\n\n'.join(cleaned_scenes))
            return True
        except Exception as e:
            print(f"Error processing screenplay: {str(e)}")
            return False
|