File size: 4,264 Bytes
680c044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import re
from pathlib import Path
from typing import List
import google.generativeai as genai
from PyPDF2 import PdfReader
from tqdm import tqdm


class GeminiProcessor:
    """Clean up screenplay text extracted from a PDF with help from Gemini.

    The pipeline is: extract text from the PDF, normalise it with regex
    heuristics, split it into scenes on INT./EXT./INT/EXT. headings, send
    each scene to the model for a spacing-only cleanup, and write the
    joined result to disk.
    """

    # Largest difference in whitespace-separated word count tolerated between
    # a scene and the model's rewrite before the rewrite is rejected.
    _MAX_WORD_DRIFT = 3

    # One scene: a heading (prefix plus at least one more character on the
    # same line) followed by every consecutive non-empty line that does not
    # itself start with a heading prefix.  A line may be terminated by a
    # newline OR by the end of the text, so the final line is not lost when
    # the input has no trailing newline.
    _SCENE_RE = re.compile(
        r'((?:INT\.|EXT\.|INT/EXT\.)[^\n]+(?:\n|\Z)'
        r'(?:(?!(?:INT\.|EXT\.|INT/EXT\.))[^\n]+(?:\n|\Z))*)'
    )

    def __init__(self, model_name: str = 'gemini-pro'):
        """Configure the Gemini client from the GOOGLE_API_KEY env variable.

        Args:
            model_name: Name passed to ``genai.GenerativeModel``.  Defaults
                to ``'gemini-pro'``, the value that used to be hard-coded.

        Raises:
            ValueError: If GOOGLE_API_KEY is unset or empty.
        """
        self.api_key = os.getenv("GOOGLE_API_KEY")
        if not self.api_key:
            raise ValueError("GOOGLE_API_KEY not found")

        # Configure Gemini
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel(model_name)

    def preprocess_text(self, text: str) -> str:
        """Normalise raw screenplay text with regex heuristics.

        Strips markup tags, standalone heading prefixes, trailing
        ``<digits>.`` line numbers and ``(CONT'D)`` markers, tidies spacing,
        and drops blank lines and lines identical to the previously kept
        line.

        Args:
            text: Raw screenplay text.

        Returns:
            The cleaned text, lines joined by single newlines with no
            trailing newline.
        """
        # Remove anything that looks like a markup tag (the tag itself only;
        # text between an opening and closing tag is kept).
        text = re.sub(r'<[^>]+>', '', text)

        # Drop heading prefixes that sit alone on a line.  Replace with a
        # newline (not the empty string) so the lines before and after the
        # dropped prefix are not glued together.
        text = re.sub(r'\n(?:INT\.|EXT\.|INT/EXT\.)\s*\n', '\n', text)

        # Remove line numbers ("12." at end of line) and (CONT'D) markers
        text = re.sub(r'\d+\.$', '', text, flags=re.MULTILINE)
        text = re.sub(r'\(CONT\'D\)\d*', '', text)

        # Remove spaces before punctuation.  Newlines are deliberately
        # excluded so a line that starts with punctuation (e.g. "...and")
        # is not merged into the previous line.
        text = re.sub(r'[^\S\n]+([.,!?])', r'\1', text)

        # Clean up multiple spaces and line breaks
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Drop blank lines, consecutive repeats, and any remaining bare
        # heading prefixes (e.g. one on the very first line, which the
        # substitution above cannot match because it needs a leading newline).
        cleaned_lines = []
        prev_line = None
        for line in text.split('\n'):
            stripped = line.strip()
            if not stripped or line == prev_line:
                continue
            if stripped in ('INT.', 'EXT.', 'INT/EXT.'):
                continue
            cleaned_lines.append(line)
            prev_line = line

        return '\n'.join(cleaned_lines)

    def split_into_scenes(self, text: str) -> List[str]:
        """Split screenplay text into scenes, each starting at its heading.

        A scene runs from a heading to the line before the next line that
        starts with a heading prefix; a blank line also ends a scene.  Text
        that is not part of any scene (for example, anything before the
        first heading) is not returned.

        Args:
            text: Screenplay text, typically the output of
                ``preprocess_text``.

        Returns:
            The scenes in document order, stripped of surrounding
            whitespace; empty list if no heading is found.
        """
        stripped_scenes = (scene.strip() for scene in self._SCENE_RE.findall(text))
        return [scene for scene in stripped_scenes if scene]

    def clean_scene(self, scene: str) -> str:
        """Ask Gemini to fix spacing/indentation in a single scene.

        The model's answer is accepted only if its word count is within
        ``_MAX_WORD_DRIFT`` words of the input, as a cheap guard against the
        model rewriting content.  On rejection, an empty answer, or any
        error, the original scene is returned unchanged.

        Args:
            scene: Text of one scene.

        Returns:
            The model's cleaned scene (stripped), or ``scene`` as a fallback.
        """
        prompt = f"""Fix ONLY spacing and indentation in this screenplay scene.
    DO NOT modify any words or content. DO NOT add or remove lines.
    Keep original capitalization and formatting:

    {scene}"""

        try:
            response = self.model.generate_content(prompt)
            if response.text:
                cleaned = response.text
                # Basic validation: word count must stay (almost) the same
                drift = abs(len(scene.split()) - len(cleaned.split()))
                if drift <= self._MAX_WORD_DRIFT:
                    return cleaned.strip()
            return scene

        except Exception as e:
            # Best effort: any API failure falls back to the untouched scene.
            print(f"Error cleaning scene: {str(e)}")
            return scene

    def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
        """Run the full pipeline on a PDF and write the cleaned text.

        Args:
            pdf_path: Path of the screenplay PDF to read.
            output_path: Path of the UTF-8 text file to write; missing
                parent directories are created.

        Returns:
            True on success, False if any step raised (the error is printed).
        """
        try:
            # Read PDF
            with open(pdf_path, 'rb') as file:
                pdf = PdfReader(file)
                # `or ''` guards against a page yielding no text, which
                # would otherwise break the join.
                text = '\n'.join(
                    (page.extract_text() or '') for page in pdf.pages
                )

            # Initial preprocessing
            text = self.preprocess_text(text)

            # Split into scenes
            scenes = self.split_into_scenes(text)
            print(f"Found {len(scenes)} scenes")

            # Process each scene
            cleaned_scenes = []
            for scene in tqdm(scenes, desc="Processing scenes"):
                cleaned = self.clean_scene(scene)
                if cleaned:
                    # NOTE(review): this second pass collapses runs of spaces
                    # again, so multi-space indentation returned by the model
                    # is reduced to single spaces - confirm this is intended.
                    cleaned = self.preprocess_text(cleaned)
                    cleaned_scenes.append(cleaned)

            # Save result
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write('\n\n'.join(cleaned_scenes))

            return True

        except Exception as e:
            # Top-level boundary: report the failure and signal it to the
            # caller through the return value.
            print(f"Error processing screenplay: {str(e)}")
            return False