File size: 4,604 Bytes
0952601
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import PyPDF2
import duckdb
import pandas as pd
import os
import random

class PDFProcessor:
    """Load the text of a PDF into an in-memory DuckDB table and query it.

    Every non-empty chunk of a page (chunks are separated by blank lines)
    becomes one row of ``pdf_content`` with:

    * ``page_number`` - 1-based page index,
    * ``content``     - the full text of the page the chunk came from,
    * ``section``     - the chunk itself, stripped,
    * ``category``    - the first line of the chunk, stripped.

    The ``embedding`` column is created but never written by this class.

    The object can be used as a context manager, in which case the database
    connection is closed on exit.
    """

    # Lower-cased form of the dropdown entry that means "no category filter".
    _ALL_CATEGORIES = 'all'

    def __init__(self):
        # Use in-memory database
        self.conn = duckdb.connect(':memory:')
        self.setup_database()

    def __enter__(self):
        """Return the processor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block.

        Returns ``None`` so any exception raised inside the block propagates.
        """
        self.close()

    def setup_database(self):
        """Create the ``pdf_content`` table if it does not exist yet."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS pdf_content (
                page_number INTEGER,
                content TEXT,
                section TEXT,
                category TEXT,
                embedding TEXT
            )
        """)

    def process_pdf(self, pdf_path):
        """Read the PDF at ``pdf_path`` and insert one row per text section.

        Args:
            pdf_path: Path of the PDF file; it is opened in binary mode.
        """
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            for page_number, page in enumerate(reader.pages, start=1):
                # Treat a page with no extractable text as empty instead of
                # failing on ``.split`` below.
                text = page.extract_text() or ''

                # Split text into sections (you might want to adjust this
                # based on your PDF structure).
                for raw_section in text.split('\n\n'):
                    section = raw_section.strip()
                    if not section:
                        continue

                    # The category is the first line of the section.
                    category = section.split('\n', 1)[0].strip()
                    self.conn.execute("""
                            INSERT INTO pdf_content (page_number, content, section, category)
                            VALUES (?, ?, ?, ?)
                        """, [page_number, text, section, category])

    def get_relevant_content(self, query, limit=5):
        """Return up to ``limit`` distinct page texts containing ``query``.

        The match is a case-insensitive substring search (``ILIKE``) on the
        ``content`` column, i.e. on whole-page text.

        Args:
            query: Text to look for.
            limit: Maximum number of results.

        Returns:
            A list of page-text strings.
        """
        # For now, we'll do a simple text search
        # In a production environment, you might want to use proper embeddings and vector similarity
        result = self.conn.execute("""
            SELECT DISTINCT content
            FROM pdf_content
            WHERE content ILIKE '%' || ? || '%'
            LIMIT ?
        """, [query, limit]).fetchall()

        return [row[0] for row in result]

    def get_random_sections(self, limit=3):
        """Return up to ``limit`` random distinct sections.

        Only sections longer than 50 characters are considered.
        """
        result = self.conn.execute("""
            SELECT DISTINCT section
            FROM pdf_content
            WHERE length(section) > 50
            ORDER BY random()
            LIMIT ?
        """, [limit]).fetchall()

        return [row[0] for row in result]

    def get_random_sections_by_category(self, category='all', limit=3):
        """Return up to ``limit`` random distinct sections, optionally filtered.

        Only sections longer than 50 characters are considered.

        Args:
            category: ``'all'`` in any letter case (for example the ``'All'``
                entry returned by ``get_all_categories``) or ``None`` selects
                from every section. Any other value keeps only sections whose
                text contains it, compared case-insensitively.
            limit: Maximum number of sections to return.

        Returns:
            A list of section strings in random order.
        """
        # Compare case-insensitively: the dropdown list offers 'All', which
        # must not be treated as the substring filter '%all%'.
        if category is None or category.strip().lower() == self._ALL_CATEGORIES:
            rows = self.conn.execute("""
                SELECT DISTINCT section
                FROM pdf_content
                WHERE length(section) > 50
            """).fetchall()
            sections = [row[0] for row in rows]
            random.shuffle(sections)
            return sections[:limit]

        query = """
                SELECT DISTINCT section
                FROM pdf_content
                WHERE length(section) > 50 AND lower(section) LIKE ?
                ORDER BY random()
                LIMIT ?
            """
        params = [f'%{category.lower()}%', limit]
        result = self.conn.execute(query, params).fetchall()
        return [row[0] for row in result]

    def get_all_categories(self):
        """Return the static list of unique categories for the dropdown.

        A new list is built on every call, so callers may modify the result.
        """
        return [
            'All',
            'CUSTOMER OBSESSION',
            'OWNERSHIP',
            'INVENT AND SIMPLIFY',
            'ARE RIGHT, A LOT',
            'HIRE AND DEVELOP THE BEST',
            'INSIST ON THE HIGHEST STANDARDS',
            'THINK BIG',
            'BIAS FOR ACTION',
            'BEING FRUGAL (FRUGALITY)',
            'EARN TRUST',
            'DIVE DEEP',
            'DELIVER RESULTS',
            'HAVE BACKBONE: DISAGREE AND COMMIT',
            'LEARN & BE CURIOUS',
            'SUCCESS & SCALE BRING BROAD RESPONSIBILITY',
            "STRIVE TO BE EARTH'S BEST",
        ]

    def close(self):
        """Close the database connection"""
        self.conn.close()