Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- app.py +80 -0
- gpt_utils.py +15 -0
- pi_shard.py +26 -0
- pi_utils.py +23 -0
- pi_vector_utils.py +21 -0
- requirements.txt +6 -0
- style.css +6 -0
app.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Play with Pi — a Streamlit app that chunks ("shards") an uploaded document
using the digits of π, then lets the user analyze chunks and answer questions
about the document with OpenAI models.

Pipeline per rerun: upload → extract text → π-shard → (optional) GPT analysis
of a selected chunk → (optional) embedding-based Q&A over all chunks.
"""
import streamlit as st
from pi_shard import pi_shard, get_pi_digits
from gpt_utils import analyze_chunk
from pi_utils import random_pi_fact, generate_pi_graph
from pi_vector_utils import get_embedding, pi_rotation, pi_modulated_similarity
import fitz  # PyMuPDF, used for PDF text extraction
import docx  # python-docx, used for .docx text extraction

st.set_page_config(page_title="Play with Pi", layout="wide")
st.title("🎲 Play with Pi - π-Based Chunking Engine")

# Sidebar controls: the API key is collected here and passed to every helper
# that talks to OpenAI.
st.sidebar.header("🔧 Controls")
openai_key = st.sidebar.text_input("OpenAI API Key", type="password")
uploaded_file = st.file_uploader("Upload a document", type=["txt", "pdf", "docx"])

if uploaded_file:
    # Handle uploaded file types. The uploader's `type` filter guarantees one
    # of these three branches matches, so `text` is always bound below.
    if uploaded_file.name.endswith(".txt"):
        text = uploaded_file.read().decode("utf-8")
    elif uploaded_file.name.endswith(".pdf"):
        # fitz can open a PDF directly from the in-memory upload buffer.
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        text = " ".join([page.get_text() for page in doc])
    elif uploaded_file.name.endswith(".docx"):
        doc = docx.Document(uploaded_file)
        text = "\n".join([para.text for para in doc.paragraphs])

    st.subheader("📄 Original Document")
    # Only the first 1000 characters are previewed to keep the UI light.
    st.text_area("Document Preview", text[:1000] + "...", height=150)

    # Create π-based chunks
    chunks = pi_shard(text)
    st.subheader(f"🔍 π-Shards (Total: {len(chunks)})")
    selected = st.selectbox("Select Chunk", range(len(chunks)))
    st.code(chunks[selected], language="markdown")

    # GPT Analysis of Selected Chunk (only offered once a key is entered)
    if openai_key:
        st.markdown("#### ✨ GPT Analysis")
        if st.button("Analyze Selected Chunk"):
            with st.spinner("Thinking like π..."):
                result = analyze_chunk(chunks[selected], openai_key)
                st.success("Done!")
                st.markdown(result)

    # Question Answering Section
    st.markdown("#### 🤔 Ask a Question about the Document")
    user_query = st.text_area("Enter your question:", "")

    if openai_key and st.button("🚀 Submit"):
        if user_query:
            st.info("Generating embeddings and rotating using π...")
            pi_digits = get_pi_digits(len(chunks))
            query_vec = get_embedding(user_query, openai_key)

            # Score every chunk: embed it, rotate the embedding by the chunk's
            # π digit, then take the π-weighted cosine similarity to the query.
            # NOTE(review): this embeds every chunk on every submit — one API
            # call per chunk; consider caching if documents are large.
            scores = []
            for i, chunk in enumerate(chunks):
                chunk_vec = get_embedding(chunk, openai_key)
                rotated = pi_rotation(chunk_vec, pi_digits[i])
                sim = pi_modulated_similarity(query_vec, rotated, pi_digits[i])
                scores.append((i, sim))

            # Highest similarity first; the best match is shown and analyzed.
            scores.sort(key=lambda x: x[1], reverse=True)
            top_index = scores[0][0]

            st.success(f"✅ Best π-Chunk Match (Chunk #{top_index})")
            st.code(chunks[top_index])

            # Analyze matched chunk with GPT
            st.markdown("#### 📚 GPT Response to Query")
            with st.spinner("Analyzing the matched chunk..."):
                answer = analyze_chunk(chunks[top_index], openai_key)
                st.markdown(answer)

# Sidebar - Pi facts and visualization (always shown, upload or not)
st.sidebar.subheader("🎲 Pi Fact")
st.sidebar.info(random_pi_fact())

if st.sidebar.button("🌀 Show π-Graph"):
    fig = generate_pi_graph()
    st.pyplot(fig)
|
gpt_utils.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import openai
|
| 2 |
+
|
| 3 |
+
def analyze_chunk(chunk, openai_key):
    """Summarize and analyze one text chunk with the OpenAI chat API.

    Args:
        chunk: The text chunk to send to the model.
        openai_key: OpenAI API key supplied by the caller (the app collects
            it from the sidebar and passes it through).

    Returns:
        The model's response text on success, or an ``"Error: ..."`` string
        describing the failure (the UI renders either as markdown).
    """
    # SECURITY FIX: use the caller-supplied key. The previous revision
    # hard-coded a live API key in source — leaking the credential in the
    # repo AND silently ignoring the key the user typed into the app.
    openai.api_key = openai_key
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "Summarize and analyze the following chunk."},
                {"role": "user", "content": chunk}
            ]
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        # Best-effort: surface the failure as text rather than crashing the UI.
        return f"Error: {str(e)}"
|
pi_shard.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def clean_text(text):
    """Collapse every run of whitespace to a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
| 5 |
+
|
| 6 |
+
def get_pi_digits(n=100):
    """Return the first *n* digits of π as ints, read from assets/pi_digits.txt.

    Args:
        n: Number of digits wanted (default 100).

    Returns:
        List of up to *n* single-digit ints (fewer only if the file itself
        contains fewer than *n* digits).
    """
    # Use a context manager so the handle is always closed, and filter out
    # non-digit characters BEFORE slicing: the previous code sliced the raw
    # string first, so any stray character (a space, a BOM, ...) inside the
    # first n positions silently shrank the result below n digits.
    with open('assets/pi_digits.txt', 'r') as f:
        digits = [int(ch) for ch in f.read() if ch.isdigit()]
    return digits[:n]
|
| 10 |
+
|
| 11 |
+
def pi_shard(text, max_chunks=50, pi_digits=None):
    """Split *text* into overlapping chunks whose sizes follow the digits of π.

    Chunk i covers ``(digit_i + 1) * 50`` characters and overlaps the next
    chunk by ``digit_{i+1} * 5`` characters, so the stride is always at
    least 5 characters and the loop always terminates.

    Args:
        text: Raw input text; whitespace is normalized first.
        max_chunks: Upper bound on the number of chunks produced.
        pi_digits: Optional digit sequence (ints 0-9) driving chunk sizes.
            When None — the default, preserving the original behavior —
            digits are loaded from assets/pi_digits.txt via get_pi_digits().
            The parameter makes the sharder usable/testable without that
            asset file.

    Returns:
        List of chunk strings (empty for empty/whitespace-only input).
    """
    # Normalize whitespace (same transformation as clean_text); inlined so
    # this function is self-contained.
    text = re.sub(r'\s+', ' ', text).strip()
    if pi_digits is None:
        pi_digits = get_pi_digits()
    chunks = []
    index = 0
    i = 0

    while index < len(text) and len(chunks) < max_chunks:
        # Digit 0 still yields a positive chunk length (1 * 50 chars).
        length = pi_digits[i % len(pi_digits)] + 1
        chunk = text[index:index + length * 50]
        # Overlap with the next chunk, sized by the NEXT digit (max 45 < 50,
        # so `index` strictly increases every iteration).
        overlap = pi_digits[(i + 1) % len(pi_digits)] * 5
        chunks.append(chunk)
        index += length * 50 - overlap
        i += 1

    return chunks
|
pi_utils.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import numpy as np
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
|
| 5 |
+
pi_facts = [
|
| 6 |
+
"π is irrational and never ends!",
|
| 7 |
+
"π has been calculated to over 62 trillion digits!",
|
| 8 |
+
"The symbol π was first used in 1706.",
|
| 9 |
+
"You can’t express π as a fraction!",
|
| 10 |
+
"March 14 (3/14) is Pi Day!"
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
def random_pi_fact():
    """Return one of the canned π facts, chosen uniformly at random."""
    fact_index = random.randrange(len(pi_facts))
    return pi_facts[fact_index]
|
| 15 |
+
|
| 16 |
+
def generate_pi_graph():
    """Plot a "waveform" from the digits of π: the cumulative alternating sum
    of (at most) the first 500 digits read from assets/pi_digits.txt.

    Returns:
        A matplotlib Figure suitable for st.pyplot().
    """
    # Use a context manager: the previous version opened the file inline and
    # never closed it, leaking the handle on every button press.
    with open('assets/pi_digits.txt') as f:
        digits = [int(d) for d in f.read() if d.isdigit()]
    # Alternating-sign cumulative sum over the first 500 digits.
    y = np.cumsum([(-1) ** i * d for i, d in enumerate(digits[:500])])
    # Only build x for the points actually plotted (the old code built an
    # index list over ALL digits and then sliced it back down).
    x = range(len(y))
    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_title("π Waveform based on Digits")
    return fig
|
pi_vector_utils.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from numpy.linalg import norm
|
| 3 |
+
import openai
|
| 4 |
+
|
| 5 |
+
def get_embedding(text, openai_key):
    """Embed *text* with OpenAI's text-embedding-ada-002 model.

    Args:
        text: Input string to embed.
        openai_key: OpenAI API key to authenticate the request.

    Returns:
        A 1-D numpy array of length 1536. On any API failure a zero vector
        is returned instead of raising, keeping the UI alive.
        NOTE(review): the zero vector has zero norm, so downstream cosine
        similarity divides by zero and yields NaN for that chunk — callers
        should guard against a zero denominator.
    """
    openai.api_key = openai_key
    try:
        result = openai.Embedding.create(model="text-embedding-ada-002", input=text)
        return np.array(result['data'][0]['embedding'])
    except Exception:
        # Best-effort fallback, unchanged in behavior; the previously unused
        # exception binding (`as e`) was dropped.
        return np.zeros(1536)  # Return zero vector on error
|
| 12 |
+
|
| 13 |
+
def pi_rotation(embedding, pi_digit):
    """Mix an embedding with a shifted copy of itself, weighted by a π digit.

    The digit (0-9) maps linearly to an angle theta in [0, π]; the result is
    cos(theta) * embedding + sin(theta) * roll(embedding, 1).
    """
    angle = np.pi * (pi_digit / 9)
    shifted = np.roll(embedding, 1)
    return np.cos(angle) * embedding + np.sin(angle) * shifted
|
| 17 |
+
|
| 18 |
+
def pi_modulated_similarity(query_vec, chunk_vec, pi_digit):
    """Cosine similarity between two vectors, scaled by a π-digit weight.

    The weight is ``1 + 0.1 * (pi_digit mod 5)``, i.e. in [1.0, 1.4], so a
    chunk's π digit slightly boosts or leaves unchanged its raw similarity.
    """
    denominator = norm(query_vec) * norm(chunk_vec)
    cosine = np.dot(query_vec, chunk_vec) / denominator
    return cosine * (1 + (pi_digit % 5) * 0.1)
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
openai==0.28
|
| 3 |
+
python-docx
|
| 4 |
+
PyMuPDF
|
| 5 |
+
matplotlib
|
| 6 |
+
numpy
|
style.css
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
body {
|
| 2 |
+
background-color: #f0fff0;
|
| 3 |
+
}
|
| 4 |
+
h1, h2 {
|
| 5 |
+
color: #0a0;
|
| 6 |
+
}
|