Spaces:
Sleeping
Sleeping
File size: 4,604 Bytes
0952601 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import PyPDF2
import duckdb
import pandas as pd
import os
import random
class PDFProcessor:
    """Load PDF text into an in-memory DuckDB table and query it.

    Each PDF page is split into sections on blank lines; every non-empty
    section becomes one row of the ``pdf_content`` table. The instance owns
    its DuckDB connection, so call :meth:`close` when done or use the object
    as a context manager::

        with PDFProcessor() as processor:
            processor.process_pdf(path)
    """

    def __init__(self):
        # Use in-memory database: nothing is persisted between instances.
        self.conn = duckdb.connect(':memory:')
        self.setup_database()

    def __enter__(self):
        """Return the processor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block.

        Returns ``None`` so any in-flight exception keeps propagating.
        """
        self.close()

    def setup_database(self):
        """Create the ``pdf_content`` table if it does not exist yet."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS pdf_content (
                page_number INTEGER,
                content TEXT,
                section TEXT,
                category TEXT,
                embedding TEXT
            )
        """)

    def process_pdf(self, pdf_path):
        """Extract the text of every page of a PDF and store it by section.

        Args:
            pdf_path: Path of the PDF file to read.

        For each non-blank section of a page one row is inserted with:
        the 1-based page number, the full text of the page (``content``),
        the stripped section text (``section``) and the first line of the
        section (``category``). The ``embedding`` column is left NULL.
        """
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_number, page in enumerate(reader.pages, start=1):
                # Guard against a page with no extractable text so that the
                # split below never runs on None.
                text = page.extract_text() or ''
                # Sections are separated by blank lines (adjust this to the
                # structure of your PDF if needed).
                for raw_section in text.split('\n\n'):
                    section = raw_section.strip()
                    if not section:
                        continue
                    # The category is the first line of the section.
                    category = section.partition('\n')[0].strip()
                    self.conn.execute("""
                        INSERT INTO pdf_content (page_number, content, section, category)
                        VALUES (?, ?, ?, ?)
                    """, [page_number, text, section, category])

    def get_relevant_content(self, query, limit=5):
        """Return up to ``limit`` distinct page texts that contain ``query``.

        Args:
            query: Text to look for (case-insensitive substring match).
            limit: Maximum number of results.

        Returns:
            A list of ``content`` strings.

        NOTE: this is a plain text search, not an embedding/vector lookup.
        ``query`` is spliced into an ILIKE pattern, so ``%`` and ``_`` inside
        it act as wildcards.
        """
        result = self.conn.execute("""
            SELECT DISTINCT content
            FROM pdf_content
            WHERE content ILIKE '%' || ? || '%'
            LIMIT ?
        """, [query, limit]).fetchall()
        return [row[0] for row in result]

    def get_random_sections(self, limit=3):
        """Return up to ``limit`` distinct sections in random order.

        Only sections longer than 50 characters are considered.
        """
        result = self.conn.execute("""
            SELECT DISTINCT section
            FROM pdf_content
            WHERE length(section) > 50
            ORDER BY random()
            LIMIT ?
        """, [limit]).fetchall()
        return [row[0] for row in result]

    def get_random_sections_by_category(self, category='all', limit=3):
        """Return up to ``limit`` random sections, optionally filtered.

        Args:
            category: Case-insensitive substring that a section must contain.
                ``'all'`` in any letter case (for example the ``'All'`` entry
                of :meth:`get_all_categories`) or ``None`` disables the
                filter.
            limit: Maximum number of sections to return.

        Returns:
            A list of distinct section strings, each longer than 50
            characters, in random order.
        """
        # Compare case-insensitively: the dropdown list spells it 'All', and
        # an exact match on 'all' would send that value to the substring
        # search below and look for the literal text "all".
        if category is None or category.strip().lower() == 'all':
            rows = self.conn.execute("""
                SELECT DISTINCT section
                FROM pdf_content
                WHERE length(section) > 50
            """).fetchall()
            sections = [row[0] for row in rows]
            random.shuffle(sections)
            return sections[:limit]

        query = """
            SELECT DISTINCT section
            FROM pdf_content
            WHERE length(section) > 50 AND lower(section) LIKE ?
            ORDER BY random()
            LIMIT ?
        """
        params = [f'%{category.lower()}%', limit]
        result = self.conn.execute(query, params).fetchall()
        return [row[0] for row in result]

    def get_all_categories(self):
        """Return the static list of categories for the dropdown.

        The list is hard-coded; it is not read from the database.
        """
        return [
            'All',
            'CUSTOMER OBSESSION',
            'OWNERSHIP',
            'INVENT AND SIMPLIFY',
            'ARE RIGHT, A LOT',
            'HIRE AND DEVELOP THE BEST',
            'INSIST ON THE HIGHEST STANDARDS',
            'THINK BIG',
            'BIAS FOR ACTION',
            'BEING FRUGAL (FRUGALITY)',
            'EARN TRUST',
            'DIVE DEEP',
            'DELIVER RESULTS',
            'HAVE BACKBONE: DISAGREE AND COMMIT',
            'LEARN & BE CURIOUS',
            'SUCCESS & SCALE BRING BROAD RESPONSIBILITY',
            "STRIVE TO BE EARTH'S BEST",
        ]

    def close(self):
        """Close the database connection."""
        self.conn.close()