Delete streamlit-demo
Browse files- streamlit-demo/.streamlit/secrets.toml +0 -6
- streamlit-demo/Dockerfile +0 -25
- streamlit-demo/README.md +0 -174
- streamlit-demo/__pycache__/brain.cpython-310.pyc +0 -0
- streamlit-demo/__pycache__/components_keys.cpython-310.pyc +0 -0
- streamlit-demo/__pycache__/explorer.cpython-310.pyc +0 -0
- streamlit-demo/__pycache__/files.cpython-310.pyc +0 -0
- streamlit-demo/__pycache__/question.cpython-310.pyc +0 -0
- streamlit-demo/__pycache__/stats.cpython-310.pyc +0 -0
- streamlit-demo/__pycache__/utils.cpython-310.pyc +0 -0
- streamlit-demo/app.py +0 -123
- streamlit-demo/brain.py +0 -39
- streamlit-demo/components_keys.py +0 -4
- streamlit-demo/explorer.py +0 -12
- streamlit-demo/files.py +0 -191
- streamlit-demo/loaders/__init__.py +0 -0
- streamlit-demo/loaders/__pycache__/__init__.cpython-310.pyc +0 -0
- streamlit-demo/loaders/__pycache__/audio.cpython-310.pyc +0 -0
- streamlit-demo/loaders/__pycache__/common.cpython-310.pyc +0 -0
- streamlit-demo/loaders/__pycache__/csv.cpython-310.pyc +0 -0
- streamlit-demo/loaders/__pycache__/docx.cpython-310.pyc +0 -0
- streamlit-demo/loaders/__pycache__/html.cpython-310.pyc +0 -0
- streamlit-demo/loaders/__pycache__/markdown.cpython-310.pyc +0 -0
- streamlit-demo/loaders/__pycache__/pdf.cpython-310.pyc +0 -0
- streamlit-demo/loaders/__pycache__/powerpoint.cpython-310.pyc +0 -0
- streamlit-demo/loaders/__pycache__/txt.cpython-310.pyc +0 -0
- streamlit-demo/loaders/audio.py +0 -65
- streamlit-demo/loaders/common.py +0 -42
- streamlit-demo/loaders/csv.py +0 -5
- streamlit-demo/loaders/docx.py +0 -5
- streamlit-demo/loaders/html.py +0 -47
- streamlit-demo/loaders/markdown.py +0 -5
- streamlit-demo/loaders/pdf.py +0 -6
- streamlit-demo/loaders/powerpoint.py +0 -5
- streamlit-demo/loaders/txt.py +0 -5
- streamlit-demo/question.py +0 -81
- streamlit-demo/requirements.txt +0 -14
- streamlit-demo/sidebar.py +0 -11
- streamlit-demo/stats.py +0 -31
- streamlit-demo/utils.py +0 -11
streamlit-demo/.streamlit/secrets.toml
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
supabase_url = "https://qlvpvyrbyynccpqyljoc.supabase.co"
|
| 2 |
-
supabase_service_key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InFsdnB2eXJieXluY2NwcXlsam9jIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTY4NDkxODY4NywiZXhwIjoyMDAwNDk0Njg3fQ.hTDr6FydSOdl0kyFzTiS6mEmkuYXugAAJy_R7eIQIl8"
|
| 3 |
-
openai_api_key = "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"
|
| 4 |
-
anthropic_api_key = ""
|
| 5 |
-
self_hosted = "true"
|
| 6 |
-
usage_limit = 2000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/Dockerfile
DELETED
|
@@ -1,25 +0,0 @@
|
|
| 1 |
-
# app/Dockerfile
|
| 2 |
-
FROM python:3.11-slim
|
| 3 |
-
|
| 4 |
-
WORKDIR /app
|
| 5 |
-
|
| 6 |
-
RUN apt-get update && apt-get install -y \
|
| 7 |
-
build-essential \
|
| 8 |
-
curl \
|
| 9 |
-
software-properties-common \
|
| 10 |
-
git \
|
| 11 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
-
|
| 13 |
-
COPY . /app
|
| 14 |
-
|
| 15 |
-
## Mount .streamlit folder to load config.toml and secrets.toml
|
| 16 |
-
|
| 17 |
-
RUN pip3 install -r requirements.txt
|
| 18 |
-
|
| 19 |
-
EXPOSE 8501
|
| 20 |
-
|
| 21 |
-
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 22 |
-
|
| 23 |
-
VOLUME [ "/root/.streamlit" ]
|
| 24 |
-
|
| 25 |
-
ENTRYPOINT ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/README.md
DELETED
|
@@ -1,174 +0,0 @@
|
|
| 1 |
-
# Quivr
|
| 2 |
-
|
| 3 |
-
<p align="center">
|
| 4 |
-
<img src="../logo.png" alt="Quivr-logo" width="30%">
|
| 5 |
-
<p align="center">
|
| 6 |
-
|
| 7 |
-
<a href="https://discord.gg/HUpRgp2HG8">
|
| 8 |
-
<img src="https://img.shields.io/badge/discord-join%20chat-blue.svg" alt="Join our Discord" height="40">
|
| 9 |
-
</a>
|
| 10 |
-
|
| 11 |
-
Quivr is your second brain in the cloud, designed to easily store and retrieve unstructured information. It's like Obsidian but powered by generative AI.
|
| 12 |
-
|
| 13 |
-
## Features
|
| 14 |
-
|
| 15 |
-
- **Store Anything**: Quivr can handle almost any type of data you throw at it. Text, images, code snippets, you name it.
|
| 16 |
-
- **Generative AI**: Quivr uses advanced AI to help you generate and retrieve information.
|
| 17 |
-
- **Fast and Efficient**: Designed with speed and efficiency in mind. Quivr makes sure you can access your data as quickly as possible.
|
| 18 |
-
- **Secure**: Your data is stored securely in the cloud and is always under your control.
|
| 19 |
-
- **Compatible Files**:
|
| 20 |
-
- **Text**
|
| 21 |
-
- **Markdown**
|
| 22 |
-
- **PDF**
|
| 23 |
-
- **Audio**
|
| 24 |
-
- **Video**
|
| 25 |
-
- **Open Source**: Quivr is open source and free to use.
|
| 26 |
-
## Demo
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
### Demo with GPT3.5
|
| 30 |
-
https://github.com/StanGirard/quivr/assets/19614572/80721777-2313-468f-b75e-09379f694653
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
### Demo with Claude 100k context
|
| 34 |
-
https://github.com/StanGirard/quivr/assets/5101573/9dba918c-9032-4c8d-9eea-94336d2c8bd4
|
| 35 |
-
|
| 36 |
-
## Getting Started
|
| 37 |
-
|
| 38 |
-
These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
|
| 39 |
-
|
| 40 |
-
### Prerequisites
|
| 41 |
-
|
| 42 |
-
Make sure you have the following installed before continuing:
|
| 43 |
-
|
| 44 |
-
- Python 3.10 or higher
|
| 45 |
-
- Pip
|
| 46 |
-
- Virtualenv
|
| 47 |
-
|
| 48 |
-
You'll also need a [Supabase](https://supabase.com/) account for:
|
| 49 |
-
|
| 50 |
-
- A new Supabase project
|
| 51 |
-
- Supabase Project API key
|
| 52 |
-
- Supabase Project URL
|
| 53 |
-
|
| 54 |
-
### Installing
|
| 55 |
-
|
| 56 |
-
- Clone the repository
|
| 57 |
-
|
| 58 |
-
```bash
|
| 59 |
-
git clone git@github.com:StanGirard/Quivr.git && cd Quivr
|
| 60 |
-
```
|
| 61 |
-
|
| 62 |
-
- Create a virtual environment
|
| 63 |
-
|
| 64 |
-
```bash
|
| 65 |
-
virtualenv venv
|
| 66 |
-
```
|
| 67 |
-
|
| 68 |
-
- Activate the virtual environment
|
| 69 |
-
|
| 70 |
-
```bash
|
| 71 |
-
source venv/bin/activate
|
| 72 |
-
```
|
| 73 |
-
|
| 74 |
-
- Install the dependencies
|
| 75 |
-
|
| 76 |
-
```bash
|
| 77 |
-
pip install -r requirements.txt
|
| 78 |
-
```
|
| 79 |
-
|
| 80 |
-
- Copy the streamlit secrets.toml example file
|
| 81 |
-
|
| 82 |
-
```bash
|
| 83 |
-
cp .streamlit/secrets.toml.example .streamlit/secrets.toml
|
| 84 |
-
```
|
| 85 |
-
|
| 86 |
-
- Add your credentials to .streamlit/secrets.toml file
|
| 87 |
-
|
| 88 |
-
```toml
|
| 89 |
-
supabase_url = "SUPABASE_URL"
|
| 90 |
-
supabase_service_key = "SUPABASE_SERVICE_KEY"
|
| 91 |
-
openai_api_key = "OPENAI_API_KEY"
|
| 92 |
-
anthropic_api_key = "ANTHROPIC_API_KEY" # Optional
|
| 93 |
-
```
|
| 94 |
-
|
| 95 |
-
_Note that the `supabase_service_key` is found in your Supabase dashboard under Project Settings -> API. Use the `anon` `public` key found in the `Project API keys` section._
|
| 96 |
-
|
| 97 |
-
- Run the following migration scripts on the Supabase database via the web interface (SQL Editor -> `New query`)
|
| 98 |
-
|
| 99 |
-
```sql
|
| 100 |
-
-- Enable the pgvector extension to work with embedding vectors
|
| 101 |
-
create extension vector;
|
| 102 |
-
|
| 103 |
-
-- Create a table to store your documents
|
| 104 |
-
create table documents (
|
| 105 |
-
id bigserial primary key,
|
| 106 |
-
content text, -- corresponds to Document.pageContent
|
| 107 |
-
metadata jsonb, -- corresponds to Document.metadata
|
| 108 |
-
embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed
|
| 109 |
-
);
|
| 110 |
-
|
| 111 |
-
CREATE FUNCTION match_documents(query_embedding vector(1536), match_count int)
|
| 112 |
-
RETURNS TABLE(
|
| 113 |
-
id bigint,
|
| 114 |
-
content text,
|
| 115 |
-
metadata jsonb,
|
| 116 |
-
-- we return matched vectors to enable maximal marginal relevance searches
|
| 117 |
-
embedding vector(1536),
|
| 118 |
-
similarity float)
|
| 119 |
-
LANGUAGE plpgsql
|
| 120 |
-
AS $$
|
| 121 |
-
# variable_conflict use_column
|
| 122 |
-
BEGIN
|
| 123 |
-
RETURN query
|
| 124 |
-
SELECT
|
| 125 |
-
id,
|
| 126 |
-
content,
|
| 127 |
-
metadata,
|
| 128 |
-
embedding,
|
| 129 |
-
1 -(documents.embedding <=> query_embedding) AS similarity
|
| 130 |
-
FROM
|
| 131 |
-
documents
|
| 132 |
-
ORDER BY
|
| 133 |
-
documents.embedding <=> query_embedding
|
| 134 |
-
LIMIT match_count;
|
| 135 |
-
END;
|
| 136 |
-
$$;
|
| 137 |
-
```
|
| 138 |
-
|
| 139 |
-
and
|
| 140 |
-
|
| 141 |
-
```sql
|
| 142 |
-
create table
|
| 143 |
-
stats (
|
| 144 |
-
-- A column called "time" with data type "timestamp"
|
| 145 |
-
time timestamp,
|
| 146 |
-
-- A column called "details" with data type "text"
|
| 147 |
-
chat boolean,
|
| 148 |
-
embedding boolean,
|
| 149 |
-
details text,
|
| 150 |
-
metadata jsonb,
|
| 151 |
-
-- An "integer" primary key column called "id" that is generated always as identity
|
| 152 |
-
id integer primary key generated always as identity
|
| 153 |
-
);
|
| 154 |
-
```
|
| 155 |
-
|
| 156 |
-
- Run the app
|
| 157 |
-
|
| 158 |
-
```bash
|
| 159 |
-
streamlit run main.py
|
| 160 |
-
```
|
| 161 |
-
|
| 162 |
-
## Built With
|
| 163 |
-
|
| 164 |
-
* [NextJS](https://nextjs.org/) - The React framework used.
|
| 165 |
-
* [FastAPI](https://fastapi.tiangolo.com/) - The API framework used.
|
| 166 |
-
* [Supabase](https://supabase.io/) - The open source Firebase alternative.
|
| 167 |
-
|
| 168 |
-
## Contributing
|
| 169 |
-
|
| 170 |
-
Open a pull request and we'll review it as soon as possible.
|
| 171 |
-
|
| 172 |
-
## Star History
|
| 173 |
-
|
| 174 |
-
[](https://star-history.com/#StanGirard/quivr&Date)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/__pycache__/brain.cpython-310.pyc
DELETED
|
Binary file (1.82 kB)
|
|
|
streamlit-demo/__pycache__/components_keys.cpython-310.pyc
DELETED
|
Binary file (386 Bytes)
|
|
|
streamlit-demo/__pycache__/explorer.cpython-310.pyc
DELETED
|
Binary file (516 Bytes)
|
|
|
streamlit-demo/__pycache__/files.cpython-310.pyc
DELETED
|
Binary file (5.24 kB)
|
|
|
streamlit-demo/__pycache__/question.cpython-310.pyc
DELETED
|
Binary file (2.56 kB)
|
|
|
streamlit-demo/__pycache__/stats.cpython-310.pyc
DELETED
|
Binary file (736 Bytes)
|
|
|
streamlit-demo/__pycache__/utils.cpython-310.pyc
DELETED
|
Binary file (565 Bytes)
|
|
|
streamlit-demo/app.py
DELETED
|
@@ -1,123 +0,0 @@
|
|
| 1 |
-
# main.py
|
| 2 |
-
import os
|
| 3 |
-
import tempfile
|
| 4 |
-
|
| 5 |
-
import streamlit as st
|
| 6 |
-
from files import file_uploader, url_uploader
|
| 7 |
-
from question import chat_with_doc
|
| 8 |
-
from brain import brain
|
| 9 |
-
from langchain.embeddings.openai import OpenAIEmbeddings
|
| 10 |
-
from langchain.vectorstores import SupabaseVectorStore
|
| 11 |
-
from supabase import Client, create_client
|
| 12 |
-
from explorer import view_document
|
| 13 |
-
from stats import get_usage_today
|
| 14 |
-
|
| 15 |
-
supabase_url = st.secrets.supabase_url
|
| 16 |
-
supabase_key = st.secrets.supabase_service_key
|
| 17 |
-
openai_api_key = st.secrets.openai_api_key
|
| 18 |
-
anthropic_api_key = st.secrets.anthropic_api_key
|
| 19 |
-
supabase: Client = create_client(supabase_url, supabase_key)
|
| 20 |
-
self_hosted = st.secrets.self_hosted
|
| 21 |
-
|
| 22 |
-
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
|
| 23 |
-
vector_store = SupabaseVectorStore(
|
| 24 |
-
supabase, embeddings, table_name="documents")
|
| 25 |
-
models = ["gpt-3.5-turbo", "gpt-4"]
|
| 26 |
-
if anthropic_api_key:
|
| 27 |
-
models += ["claude-v1", "claude-v1.3",
|
| 28 |
-
"claude-instant-v1-100k", "claude-instant-v1.1-100k"]
|
| 29 |
-
|
| 30 |
-
# Set the theme
|
| 31 |
-
st.set_page_config(
|
| 32 |
-
page_title="KPMG GPT",
|
| 33 |
-
layout="wide",
|
| 34 |
-
initial_sidebar_state="expanded",
|
| 35 |
-
)
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
st.title("KPMG GPT")
|
| 39 |
-
st.markdown("")
|
| 40 |
-
if self_hosted == "false":
|
| 41 |
-
st.markdown('**📢 Note: In the public demo, access to functionality is restricted. You can only use the GPT-3.5-turbo model and upload files up to 1Mb. To use more models and upload larger files, consider self-hosting Quivr.**')
|
| 42 |
-
|
| 43 |
-
st.markdown("---\n\n")
|
| 44 |
-
|
| 45 |
-
st.session_state["overused"] = False
|
| 46 |
-
if self_hosted == "false":
|
| 47 |
-
usage = get_usage_today(supabase)
|
| 48 |
-
if usage > st.secrets.usage_limit:
|
| 49 |
-
st.markdown(
|
| 50 |
-
f"<span style='color:red'>You have used {usage} tokens today, which is more than your daily limit of {st.secrets.usage_limit} tokens. Please come back later or consider self-hosting.</span>", unsafe_allow_html=True)
|
| 51 |
-
st.session_state["overused"] = True
|
| 52 |
-
else:
|
| 53 |
-
st.markdown(f"<span style='color:blue'>Usage today: {usage} tokens out of {st.secrets.usage_limit}</span>", unsafe_allow_html=True)
|
| 54 |
-
st.write("---")
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
# Initialize session state variables
|
| 60 |
-
if 'model' not in st.session_state:
|
| 61 |
-
st.session_state['model'] = "gpt-3.5-turbo"
|
| 62 |
-
if 'temperature' not in st.session_state:
|
| 63 |
-
st.session_state['temperature'] = 0.0
|
| 64 |
-
if 'chunk_size' not in st.session_state:
|
| 65 |
-
st.session_state['chunk_size'] = 500
|
| 66 |
-
if 'chunk_overlap' not in st.session_state:
|
| 67 |
-
st.session_state['chunk_overlap'] = 0
|
| 68 |
-
if 'max_tokens' not in st.session_state:
|
| 69 |
-
st.session_state['max_tokens'] = 256
|
| 70 |
-
|
| 71 |
-
# Create a radio button for user to choose between adding knowledge or asking a question
|
| 72 |
-
user_choice = st.radio(
|
| 73 |
-
"Choose an action", ('Add Knowledge', 'Chat with your Brain', 'Forget', "Explore"))
|
| 74 |
-
|
| 75 |
-
st.markdown("---\n\n")
|
| 76 |
-
|
| 77 |
-
if user_choice == 'Add Knowledge':
|
| 78 |
-
# Display chunk size and overlap selection only when adding knowledge
|
| 79 |
-
st.sidebar.title("Configuration")
|
| 80 |
-
st.sidebar.markdown(
|
| 81 |
-
"Choose your chunk size and overlap for adding knowledge.")
|
| 82 |
-
st.session_state['chunk_size'] = st.sidebar.slider(
|
| 83 |
-
"Select Chunk Size", 100, 2000, st.session_state['chunk_size'], 50)
|
| 84 |
-
st.session_state['chunk_overlap'] = st.sidebar.slider(
|
| 85 |
-
"Select Chunk Overlap", 0, 200, st.session_state['chunk_overlap'], 10)
|
| 86 |
-
|
| 87 |
-
# Create two columns for the file uploader and URL uploader
|
| 88 |
-
col1, col2 = st.columns(2)
|
| 89 |
-
|
| 90 |
-
with col1:
|
| 91 |
-
file_uploader(supabase, vector_store)
|
| 92 |
-
with col2:
|
| 93 |
-
url_uploader(supabase, vector_store)
|
| 94 |
-
elif user_choice == 'Chat with your Brain':
|
| 95 |
-
# Display model and temperature selection only when asking questions
|
| 96 |
-
st.sidebar.title("Configuration")
|
| 97 |
-
st.sidebar.markdown(
|
| 98 |
-
"Choose your model and temperature for asking questions.")
|
| 99 |
-
if self_hosted != "false":
|
| 100 |
-
st.session_state['model'] = st.sidebar.selectbox(
|
| 101 |
-
"Select Model", models, index=(models).index(st.session_state['model']))
|
| 102 |
-
else:
|
| 103 |
-
st.sidebar.write("**Model**: gpt-3.5-turbo")
|
| 104 |
-
st.sidebar.write("**Self Host to unlock more models such as claude-v1 and GPT4**")
|
| 105 |
-
st.session_state['model'] = "gpt-3.5-turbo"
|
| 106 |
-
st.session_state['temperature'] = st.sidebar.slider(
|
| 107 |
-
"Select Temperature", 0.0, 1.0, st.session_state['temperature'], 0.1)
|
| 108 |
-
if st.secrets.self_hosted != "false":
|
| 109 |
-
st.session_state['max_tokens'] = st.sidebar.slider(
|
| 110 |
-
"Select Max Tokens", 256, 2048, st.session_state['max_tokens'], 2048)
|
| 111 |
-
else:
|
| 112 |
-
st.session_state['max_tokens'] = 500
|
| 113 |
-
|
| 114 |
-
chat_with_doc(st.session_state['model'], vector_store, stats_db=supabase)
|
| 115 |
-
elif user_choice == 'Forget':
|
| 116 |
-
st.sidebar.title("Configuration")
|
| 117 |
-
|
| 118 |
-
brain(supabase)
|
| 119 |
-
elif user_choice == 'Explore':
|
| 120 |
-
st.sidebar.title("Configuration")
|
| 121 |
-
view_document(supabase)
|
| 122 |
-
|
| 123 |
-
st.markdown("---\n\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/brain.py
DELETED
|
@@ -1,39 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
import numpy as np
|
| 3 |
-
|
| 4 |
-
def brain(supabase):
|
| 5 |
-
## List all documents
|
| 6 |
-
response = supabase.table("documents").select("name:metadata->>file_name, size:metadata->>file_size", count="exact").execute()
|
| 7 |
-
|
| 8 |
-
documents = response.data # Access the data from the response
|
| 9 |
-
|
| 10 |
-
# Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
|
| 11 |
-
unique_data = [dict(t) for t in set(tuple(d.items()) for d in documents)]
|
| 12 |
-
|
| 13 |
-
# Sort the list of documents by size in decreasing order
|
| 14 |
-
unique_data.sort(key=lambda x: int(x['size']), reverse=True)
|
| 15 |
-
|
| 16 |
-
# Display some metrics at the top of the page
|
| 17 |
-
col1, col2 = st.columns(2)
|
| 18 |
-
col1.metric(label="Total Documents", value=len(unique_data))
|
| 19 |
-
col2.metric(label="Total Size (bytes)", value=sum(int(doc['size']) for doc in unique_data))
|
| 20 |
-
|
| 21 |
-
for document in unique_data:
|
| 22 |
-
# Create a unique key for each button by using the document name
|
| 23 |
-
button_key = f"delete_{document['name']}"
|
| 24 |
-
|
| 25 |
-
# Display the document name, size and the delete button on the same line
|
| 26 |
-
col1, col2, col3 = st.columns([3, 1, 1])
|
| 27 |
-
col1.markdown(f"**{document['name']}** ({document['size']} bytes)")
|
| 28 |
-
|
| 29 |
-
if col2.button('❌', key=button_key):
|
| 30 |
-
delete_document(supabase, document['name'])
|
| 31 |
-
|
| 32 |
-
def delete_document(supabase, document_name):
|
| 33 |
-
# Delete the document from the database
|
| 34 |
-
response = supabase.table("documents").delete().match({"metadata->>file_name": document_name}).execute()
|
| 35 |
-
# Check if the deletion was successful
|
| 36 |
-
if len(response.data) > 0:
|
| 37 |
-
st.write(f"✂️ {document_name} was deleted.")
|
| 38 |
-
else:
|
| 39 |
-
st.write(f"❌ {document_name} was not deleted.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/components_keys.py
DELETED
|
@@ -1,4 +0,0 @@
|
|
| 1 |
-
"""Store streamlit component keys"""
|
| 2 |
-
|
| 3 |
-
class ComponentsKeys:
|
| 4 |
-
FILE_UPLOADER = "file_uploader"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/explorer.py
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
def view_document(supabase):
|
| 5 |
-
# Get the document from the database
|
| 6 |
-
response = supabase.table("documents").select("content").execute()
|
| 7 |
-
st.write("**This feature is in active development**")
|
| 8 |
-
# Display a list of elements from the documents
|
| 9 |
-
# If the user clicks on an element, display the content of the document
|
| 10 |
-
for document in response.data:
|
| 11 |
-
if st.button(document['content'][:50].replace("\n", " ")):
|
| 12 |
-
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/files.py
DELETED
|
@@ -1,191 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from typing import (
|
| 3 |
-
Any,
|
| 4 |
-
Union,
|
| 5 |
-
)
|
| 6 |
-
import zipfile
|
| 7 |
-
import streamlit as st
|
| 8 |
-
from streamlit.runtime.uploaded_file_manager import (
|
| 9 |
-
UploadedFile,
|
| 10 |
-
UploadedFileRec,
|
| 11 |
-
UploadedFileManager,
|
| 12 |
-
)
|
| 13 |
-
from streamlit.runtime.scriptrunner import get_script_run_ctx
|
| 14 |
-
from supabase.client import Client
|
| 15 |
-
from langchain.vectorstores.supabase import SupabaseVectorStore
|
| 16 |
-
from components_keys import ComponentsKeys
|
| 17 |
-
from loaders.audio import process_audio
|
| 18 |
-
from loaders.txt import process_txt
|
| 19 |
-
from loaders.csv import process_csv
|
| 20 |
-
from loaders.markdown import process_markdown
|
| 21 |
-
from loaders.pdf import process_pdf
|
| 22 |
-
from loaders.html import (
|
| 23 |
-
create_html_file,
|
| 24 |
-
delete_tempfile,
|
| 25 |
-
get_html,
|
| 26 |
-
process_html,
|
| 27 |
-
)
|
| 28 |
-
from loaders.powerpoint import process_powerpoint
|
| 29 |
-
from loaders.docx import process_docx
|
| 30 |
-
from utils import compute_sha1_from_content
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
ctx = get_script_run_ctx()
|
| 34 |
-
manager = UploadedFileManager()
|
| 35 |
-
file_processors = {
|
| 36 |
-
".txt": process_txt,
|
| 37 |
-
".csv": process_csv,
|
| 38 |
-
".md": process_markdown,
|
| 39 |
-
".markdown": process_markdown,
|
| 40 |
-
".m4a": process_audio,
|
| 41 |
-
".mp3": process_audio,
|
| 42 |
-
".webm": process_audio,
|
| 43 |
-
".mp4": process_audio,
|
| 44 |
-
".mpga": process_audio,
|
| 45 |
-
".wav": process_audio,
|
| 46 |
-
".mpeg": process_audio,
|
| 47 |
-
".pdf": process_pdf,
|
| 48 |
-
".html": process_html,
|
| 49 |
-
".pptx": process_powerpoint,
|
| 50 |
-
".docx": process_docx
|
| 51 |
-
}
|
| 52 |
-
|
| 53 |
-
def file_uploader(supabase, vector_store):
|
| 54 |
-
# Omit zip file support if the `st.secrets.self_hosted` != "true" because
|
| 55 |
-
# a zip file can consist of multiple files so the limit on 1 file uploaded
|
| 56 |
-
# at a time in the demo can be circumvented.
|
| 57 |
-
accepted_file_extensions = list(file_processors.keys())
|
| 58 |
-
accept_multiple_files = st.secrets.self_hosted == "true"
|
| 59 |
-
if accept_multiple_files:
|
| 60 |
-
accepted_file_extensions += [".zip"]
|
| 61 |
-
|
| 62 |
-
files = st.file_uploader(
|
| 63 |
-
"**Upload a file**",
|
| 64 |
-
accept_multiple_files=accept_multiple_files,
|
| 65 |
-
type=accepted_file_extensions,
|
| 66 |
-
key=ComponentsKeys.FILE_UPLOADER,
|
| 67 |
-
)
|
| 68 |
-
if st.secrets.self_hosted == "false":
|
| 69 |
-
st.markdown("**In demo mode, the max file size is 1MB**")
|
| 70 |
-
if st.button("Add to Database"):
|
| 71 |
-
# Single file upload
|
| 72 |
-
if isinstance(files, UploadedFile):
|
| 73 |
-
filter_file(files, supabase, vector_store)
|
| 74 |
-
# Multiple files upload
|
| 75 |
-
elif isinstance(files, list):
|
| 76 |
-
for file in files:
|
| 77 |
-
filter_file(file, supabase, vector_store)
|
| 78 |
-
|
| 79 |
-
def file_already_exists(supabase, file):
|
| 80 |
-
file_sha1 = compute_sha1_from_content(file.getvalue())
|
| 81 |
-
response = supabase.table("documents").select("id").eq("metadata->>file_sha1", file_sha1).execute()
|
| 82 |
-
return len(response.data) > 0
|
| 83 |
-
|
| 84 |
-
def file_to_uploaded_file(file: Any) -> Union[None, UploadedFile]:
|
| 85 |
-
"""Convert a file to a streamlit `UploadedFile` object.
|
| 86 |
-
|
| 87 |
-
This allows us to unzip files and treat them the same way
|
| 88 |
-
streamlit treats files uploaded through the file uploader.
|
| 89 |
-
|
| 90 |
-
Parameters
|
| 91 |
-
---------
|
| 92 |
-
file : Any
|
| 93 |
-
The file. Can be any file supported by this app.
|
| 94 |
-
|
| 95 |
-
Returns
|
| 96 |
-
-------
|
| 97 |
-
Union[None, UploadedFile]
|
| 98 |
-
The file converted to a streamlit `UploadedFile` object.
|
| 99 |
-
Returns `None` if the script context cannot be grabbed.
|
| 100 |
-
"""
|
| 101 |
-
|
| 102 |
-
if ctx is None:
|
| 103 |
-
print("script context not found, skipping uploading file:", file.name)
|
| 104 |
-
return
|
| 105 |
-
|
| 106 |
-
file_extension = os.path.splitext(file.name)[-1]
|
| 107 |
-
file_name = file.name
|
| 108 |
-
file_data = file.read()
|
| 109 |
-
# The file manager will automatically assign an ID so pass `None`
|
| 110 |
-
# Reference: https://github.com/streamlit/streamlit/blob/9a6ce804b7977bdc1f18906d1672c45f9a9b3398/lib/streamlit/runtime/uploaded_file_manager.py#LL98C6-L98C6
|
| 111 |
-
uploaded_file_rec = UploadedFileRec(None, file_name, file_extension, file_data)
|
| 112 |
-
uploaded_file_rec = manager.add_file(
|
| 113 |
-
ctx.session_id,
|
| 114 |
-
ComponentsKeys.FILE_UPLOADER,
|
| 115 |
-
uploaded_file_rec,
|
| 116 |
-
)
|
| 117 |
-
return UploadedFile(uploaded_file_rec)
|
| 118 |
-
|
| 119 |
-
def filter_zip_file(
|
| 120 |
-
file: UploadedFile,
|
| 121 |
-
supabase: Client,
|
| 122 |
-
vector_store: SupabaseVectorStore,
|
| 123 |
-
) -> None:
|
| 124 |
-
"""Unzip the zip file then filter each unzipped file.
|
| 125 |
-
|
| 126 |
-
Parameters
|
| 127 |
-
----------
|
| 128 |
-
file : UploadedFile
|
| 129 |
-
The uploaded file from the file uploader.
|
| 130 |
-
supabase : Client
|
| 131 |
-
The supabase client.
|
| 132 |
-
vector_store : SupabaseVectorStore
|
| 133 |
-
The vector store in the database.
|
| 134 |
-
"""
|
| 135 |
-
|
| 136 |
-
with zipfile.ZipFile(file, "r") as z:
|
| 137 |
-
unzipped_files = z.namelist()
|
| 138 |
-
for unzipped_file in unzipped_files:
|
| 139 |
-
with z.open(unzipped_file, "r") as f:
|
| 140 |
-
filter_file(f, supabase, vector_store)
|
| 141 |
-
|
| 142 |
-
def filter_file(file, supabase, vector_store):
|
| 143 |
-
# Streamlit file uploads are of type `UploadedFile` which has the
|
| 144 |
-
# necessary methods and attributes for this app to work.
|
| 145 |
-
if not isinstance(file, UploadedFile):
|
| 146 |
-
file = file_to_uploaded_file(file)
|
| 147 |
-
|
| 148 |
-
file_extension = os.path.splitext(file.name)[-1]
|
| 149 |
-
if file_extension == ".zip":
|
| 150 |
-
filter_zip_file(file, supabase, vector_store)
|
| 151 |
-
return True
|
| 152 |
-
|
| 153 |
-
if file_already_exists(supabase, file):
|
| 154 |
-
st.write(f"😎 {file.name} is already in the database.")
|
| 155 |
-
return False
|
| 156 |
-
|
| 157 |
-
if file.size < 1:
|
| 158 |
-
st.write(f"💨 {file.name} is empty.")
|
| 159 |
-
return False
|
| 160 |
-
|
| 161 |
-
if file_extension in file_processors:
|
| 162 |
-
if st.secrets.self_hosted == "false":
|
| 163 |
-
file_processors[file_extension](vector_store, file, stats_db=supabase)
|
| 164 |
-
else:
|
| 165 |
-
file_processors[file_extension](vector_store, file, stats_db=None)
|
| 166 |
-
st.write(f"✅ {file.name} ")
|
| 167 |
-
return True
|
| 168 |
-
|
| 169 |
-
st.write(f"❌ {file.name} is not a valid file type.")
|
| 170 |
-
return False
|
| 171 |
-
|
| 172 |
-
def url_uploader(supabase, vector_store):
|
| 173 |
-
url = st.text_area("**Add an url**",placeholder="")
|
| 174 |
-
button = st.button("Add the URL to the database")
|
| 175 |
-
|
| 176 |
-
if button:
|
| 177 |
-
if not st.session_state["overused"]:
|
| 178 |
-
html = get_html(url)
|
| 179 |
-
if html:
|
| 180 |
-
st.write(f"Getting content ... {url} ")
|
| 181 |
-
try:
|
| 182 |
-
file, temp_file_path = create_html_file(url, html)
|
| 183 |
-
except UnicodeEncodeError as e:
|
| 184 |
-
st.write(f"❌ Error encoding character: {e}")
|
| 185 |
-
file, temp_file_path = create_html_file(url, html)
|
| 186 |
-
ret = filter_file(file, supabase, vector_store)
|
| 187 |
-
delete_tempfile(temp_file_path, url, ret)
|
| 188 |
-
else:
|
| 189 |
-
st.write(f"❌ Failed to access to {url} .")
|
| 190 |
-
else:
|
| 191 |
-
st.write("You have reached your daily limit. Please come back later or self host the solution.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/loaders/__init__.py
DELETED
|
File without changes
|
streamlit-demo/loaders/__pycache__/__init__.cpython-310.pyc
DELETED
|
Binary file (148 Bytes)
|
|
|
streamlit-demo/loaders/__pycache__/audio.cpython-310.pyc
DELETED
|
Binary file (2.39 kB)
|
|
|
streamlit-demo/loaders/__pycache__/common.cpython-310.pyc
DELETED
|
Binary file (1.7 kB)
|
|
|
streamlit-demo/loaders/__pycache__/csv.cpython-310.pyc
DELETED
|
Binary file (429 Bytes)
|
|
|
streamlit-demo/loaders/__pycache__/docx.cpython-310.pyc
DELETED
|
Binary file (426 Bytes)
|
|
|
streamlit-demo/loaders/__pycache__/html.cpython-310.pyc
DELETED
|
Binary file (1.97 kB)
|
|
|
streamlit-demo/loaders/__pycache__/markdown.cpython-310.pyc
DELETED
|
Binary file (444 Bytes)
|
|
|
streamlit-demo/loaders/__pycache__/pdf.cpython-310.pyc
DELETED
|
Binary file (420 Bytes)
|
|
|
streamlit-demo/loaders/__pycache__/powerpoint.cpython-310.pyc
DELETED
|
Binary file (452 Bytes)
|
|
|
streamlit-demo/loaders/__pycache__/txt.cpython-310.pyc
DELETED
|
Binary file (419 Bytes)
|
|
|
streamlit-demo/loaders/audio.py
DELETED
|
@@ -1,65 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import tempfile
|
| 3 |
-
from io import BytesIO
|
| 4 |
-
import time
|
| 5 |
-
import openai
|
| 6 |
-
import streamlit as st
|
| 7 |
-
from langchain.document_loaders import TextLoader
|
| 8 |
-
from langchain.embeddings.openai import OpenAIEmbeddings
|
| 9 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
-
from utils import compute_sha1_from_content
|
| 11 |
-
from langchain.schema import Document
|
| 12 |
-
from stats import add_usage
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def _transcribe_audio(api_key, audio_file, stats_db):
    """Transcribe an uploaded audio file via the OpenAI Whisper API.

    The upload is buffered into a temporary file (Whisper needs a real file
    with the right extension) and the translation result object is returned.
    """
    openai.api_key = api_key
    transcript = ""

    with BytesIO(audio_file.read()) as audio_bytes:
        # Keep the original extension so the API recognizes the format.
        file_extension = os.path.splitext(audio_file.name)[-1]

        with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_audio_file:
            temp_audio_file.write(audio_bytes.read())
            # Rewind so the API call reads from the start of the file.
            temp_audio_file.seek(0)

            if st.secrets.self_hosted == "false":
                # Hosted deployments track usage per transcription.
                add_usage(stats_db, "embedding", "audio", metadata={"file_name": audio_file.name,"file_type": file_extension})

            transcript = openai.Audio.translate("whisper-1", temp_audio_file)

    return transcript
|
| 37 |
-
|
| 38 |
-
def process_audio(vector_store, file_name, stats_db):
    """Transcribe an uploaded audio file and embed the transcript.

    `file_name` is the Streamlit UploadedFile. The Whisper transcript is
    chunked with the session's splitter settings, tagged with provenance
    metadata, and added to *vector_store*, which is returned.
    """
    if st.secrets.self_hosted == "false":
        if file_name.size > 10000000:
            # Fix: the enforced limit is 10 MB, but the old message said "1MB".
            st.error("File size is too large. Please upload a file smaller than 10MB.")
            return
    dateshort = time.strftime("%Y%m%d-%H%M%S")
    file_meta_name = f"audiotranscript_{dateshort}.txt"
    openai_api_key = st.secrets["openai_api_key"]
    transcript = _transcribe_audio(openai_api_key, file_name, stats_db)

    # Encode once; the sha1 and the "file size" are both derived from the
    # transcript text, not from the original audio upload.
    transcript_bytes = transcript.text.encode("utf-8")
    file_sha = compute_sha1_from_content(transcript_bytes)
    file_size = len(transcript_bytes)

    # Chunking parameters come from the sidebar via session state.
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_text(transcript.text)

    docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha,"file_size": file_size, "file_name": file_meta_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for text in texts]

    if st.secrets.self_hosted == "false":
        add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
    vector_store.add_documents(docs_with_metadata)
    return vector_store
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/loaders/common.py
DELETED
|
@@ -1,42 +0,0 @@
|
|
| 1 |
-
import tempfile
|
| 2 |
-
import time
|
| 3 |
-
import os
|
| 4 |
-
from utils import compute_sha1_from_file
|
| 5 |
-
from langchain.schema import Document
|
| 6 |
-
import streamlit as st
|
| 7 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
-
from stats import add_usage
|
| 9 |
-
|
| 10 |
-
def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
    """Parse an uploaded file and embed its chunks into the vector store.

    The upload is written to a temporary file, loaded with *loader_class*,
    split according to the session's chunk settings, tagged with provenance
    metadata, and added to *vector_store*. Usage is logged when *stats_db*
    is provided.
    """
    file_name = file.name
    file_size = file.size
    if st.secrets.self_hosted == "false":
        if file_size > 1000000:
            st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
            return

    dateshort = time.strftime("%Y%m%d")
    with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file.flush()

        # Loaders expect a path on disk, hence the temporary file.
        documents = loader_class(tmp_file.name).load()
        file_sha1 = compute_sha1_from_file(tmp_file.name)

    os.remove(tmp_file.name)

    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']

    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(documents)

    # Re-wrap every chunk so each one carries the same provenance metadata.
    docs_with_metadata = [
        Document(
            page_content=doc.page_content,
            metadata={
                "file_sha1": file_sha1,
                "file_size": file_size,
                "file_name": file_name,
                "chunk_size": chunk_size,
                "chunk_overlap": chunk_overlap,
                "date": dateshort,
            },
        )
        for doc in chunks
    ]

    vector_store.add_documents(docs_with_metadata)
    if stats_db:
        add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name,"file_type": file_suffix, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/loaders/csv.py
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
from .common import process_file
|
| 2 |
-
from langchain.document_loaders.csv_loader import CSVLoader
|
| 3 |
-
|
| 4 |
-
def process_csv(vector_store, file,stats_db):
    """Chunk and embed an uploaded CSV file via the shared `process_file` pipeline."""
    return process_file(vector_store, file, CSVLoader, file_suffix=".csv", stats_db=stats_db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/loaders/docx.py
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
from .common import process_file
|
| 2 |
-
from langchain.document_loaders import Docx2txtLoader
|
| 3 |
-
|
| 4 |
-
def process_docx(vector_store, file, stats_db):
    """Chunk and embed an uploaded Word (.docx) file via the shared `process_file` pipeline."""
    return process_file(vector_store, file, Docx2txtLoader, file_suffix=".docx", stats_db=stats_db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/loaders/html.py
DELETED
|
@@ -1,47 +0,0 @@
|
|
| 1 |
-
from .common import process_file
|
| 2 |
-
from langchain.document_loaders import UnstructuredHTMLLoader
|
| 3 |
-
import requests
|
| 4 |
-
import re
|
| 5 |
-
import unicodedata
|
| 6 |
-
import tempfile
|
| 7 |
-
import os
|
| 8 |
-
import streamlit as st
|
| 9 |
-
from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile
|
| 10 |
-
|
| 11 |
-
def process_html(vector_store, file, stats_db):
    """Chunk and embed an uploaded HTML file via the shared `process_file` pipeline."""
    return process_file(vector_store, file, UnstructuredHTMLLoader, file_suffix=".html", stats_db=stats_db)
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def get_html(url):
    """Fetch *url* and return the response body, or None on a non-200 status."""
    response = requests.get(url)
    return response.text if response.status_code == 200 else None
|
| 21 |
-
|
| 22 |
-
def create_html_file(url, content):
    """Save *content* to a temp .html file named after *url* and wrap it as an UploadedFile.

    Returns a (uploaded_file, temp_file_path) pair; the caller is responsible
    for removing the temp file (see `delete_tempfile`).
    """
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
    with open(temp_file_path, 'w') as temp_file:
        temp_file.write(content)

    # Fix: read the file back inside a context manager — the original used
    # open(...).read() inline and leaked the file handle.
    with open(temp_file_path, 'rb') as saved:
        data = saved.read()
    record = UploadedFileRec(id=None, name=file_name, type='text/html', data=data)
    uploaded_file = UploadedFile(record)

    return uploaded_file, temp_file_path
|
| 32 |
-
|
| 33 |
-
def delete_tempfile(temp_file_path, url, ret):
    """Remove the temporary HTML file; when *ret* is truthy, report the outcome in the UI."""
    try:
        os.remove(temp_file_path)
        if ret:
            st.write(f"✅ Content saved... {url} ")
    except OSError as e:
        print(f"Error while deleting the temporary file: {str(e)}")
        if ret:
            st.write(f"❌ Error while saving content... {url} ")
|
| 42 |
-
|
| 43 |
-
def slugify(text):
    """Turn *text* into a lowercase ASCII slug (words joined by single hyphens)."""
    # Transliterate to ASCII, dropping any characters that don't decompose.
    ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    # Keep only word characters, whitespace, and hyphens; then lowercase.
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text).strip().lower()
    # Collapse runs of whitespace/hyphens into a single hyphen.
    return re.sub(r'[-\s]+', '-', cleaned)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/loaders/markdown.py
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
from .common import process_file
|
| 2 |
-
from langchain.document_loaders import UnstructuredMarkdownLoader
|
| 3 |
-
|
| 4 |
-
def process_markdown(vector_store, file, stats_db):
    """Chunk and embed an uploaded Markdown file via the shared `process_file` pipeline."""
    return process_file(vector_store, file, UnstructuredMarkdownLoader, file_suffix=".md", stats_db=stats_db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/loaders/pdf.py
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
from .common import process_file
|
| 2 |
-
from langchain.document_loaders import PyPDFLoader
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
def process_pdf(vector_store, file, stats_db):
    """Chunk and embed an uploaded PDF file via the shared `process_file` pipeline."""
    return process_file(vector_store, file, PyPDFLoader, file_suffix=".pdf", stats_db=stats_db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/loaders/powerpoint.py
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
from .common import process_file
|
| 2 |
-
from langchain.document_loaders import UnstructuredPowerPointLoader
|
| 3 |
-
|
| 4 |
-
def process_powerpoint(vector_store, file, stats_db):
    """Chunk and embed an uploaded PowerPoint file via the shared `process_file` pipeline."""
    return process_file(vector_store, file, UnstructuredPowerPointLoader, file_suffix=".pptx", stats_db=stats_db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/loaders/txt.py
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
from .common import process_file
|
| 2 |
-
from langchain.document_loaders import TextLoader
|
| 3 |
-
|
| 4 |
-
def process_txt(vector_store, file,stats_db):
    """Chunk and embed an uploaded plain-text file via the shared `process_file` pipeline."""
    return process_file(vector_store, file, TextLoader, file_suffix=".txt", stats_db=stats_db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/question.py
DELETED
|
@@ -1,81 +0,0 @@
|
|
| 1 |
-
import anthropic
|
| 2 |
-
import streamlit as st
|
| 3 |
-
from streamlit.logger import get_logger
|
| 4 |
-
from langchain.chains import ConversationalRetrievalChain
|
| 5 |
-
from langchain.memory import ConversationBufferMemory
|
| 6 |
-
from langchain.llms import OpenAI
|
| 7 |
-
from langchain.chat_models import ChatAnthropic
|
| 8 |
-
from langchain.vectorstores import SupabaseVectorStore
|
| 9 |
-
from stats import add_usage
|
| 10 |
-
|
| 11 |
-
memory = ConversationBufferMemory(
|
| 12 |
-
memory_key="chat_history", return_messages=True)
|
| 13 |
-
openai_api_key = st.secrets.openai_api_key
|
| 14 |
-
anthropic_api_key = st.secrets.anthropic_api_key
|
| 15 |
-
logger = get_logger(__name__)
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def count_tokens(question, model):
    """Return a display string with the word count of *question*.

    For Claude models the Anthropic token count is appended as well.
    """
    parts = [f'Words: {len(question.split())}']
    if model.startswith("claude"):
        parts.append(f'Tokens: {anthropic.count_tokens(question)}')
    return ' | '.join(parts)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def chat_with_doc(model, vector_store: SupabaseVectorStore, stats_db):
    """Render the chat UI and answer the user's question against *vector_store*.

    Depending on *model*, the answer is produced by OpenAI or Anthropic via a
    ConversationalRetrievalChain; each prompt is logged to *stats_db* unless
    the user has exhausted their free credits.
    """
    if 'chat_history' not in st.session_state:
        st.session_state['chat_history'] = []

    question = st.text_area("## Ask a question")
    columns = st.columns(3)
    with columns[0]:
        button = st.button("Ask")
    with columns[1]:
        count_button = st.button("Count Tokens", type='secondary')
    with columns[2]:
        clear_history = st.button("Clear History", type='secondary')

    if clear_history:
        # Clear both the Langchain memory and the displayed history.
        memory.clear()
        st.session_state['chat_history'] = []
        st.experimental_rerun()

    if button:
        qa = None
        if not st.session_state["overused"]:
            add_usage(stats_db, "chat", "prompt" + question, {"model": model, "temperature": st.session_state['temperature']})
            if model.startswith("gpt"):
                logger.info('Using OpenAI model %s', model)
                qa = ConversationalRetrievalChain.from_llm(
                    OpenAI(
                        model_name=st.session_state['model'], openai_api_key=openai_api_key, temperature=st.session_state['temperature'], max_tokens=st.session_state['max_tokens']), vector_store.as_retriever(), memory=memory, verbose=True)
            elif anthropic_api_key and model.startswith("claude"):
                logger.info('Using Anthropics model %s', model)
                qa = ConversationalRetrievalChain.from_llm(
                    ChatAnthropic(
                        model=st.session_state['model'], anthropic_api_key=anthropic_api_key, temperature=st.session_state['temperature'], max_tokens_to_sample=st.session_state['max_tokens']), vector_store.as_retriever(), memory=memory, verbose=True, max_tokens_limit=102400)

            if qa is None:
                # Fix: the original fell through and crashed with a TypeError
                # ("'NoneType' object is not callable") when the model was
                # neither gpt-* nor a usable claude-* configuration.
                st.error(f"Model {model} is not supported or its API key is missing.")
                return

            st.session_state['chat_history'].append(("You", question))

            # Generate the model's response and add it to the chat history.
            model_response = qa({"question": question})
            logger.info('Result: %s', model_response)

            st.session_state['chat_history'].append(("KPMG GPT", model_response["answer"]))

            # Re-render the full conversation.
            st.empty()
            for speaker, text in st.session_state['chat_history']:
                st.markdown(f"**{speaker}:** {text}")
        else:
            st.error("You have used all your free credits. Please try again later or self host.")

    if count_button:
        st.write(count_tokens(question, model))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/requirements.txt
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
langchain==0.0.166
|
| 2 |
-
Markdown==3.4.3
|
| 3 |
-
openai==0.27.6
|
| 4 |
-
pdf2image==1.16.3
|
| 5 |
-
pypdf==3.8.1
|
| 6 |
-
streamlit==1.22.0
|
| 7 |
-
StrEnum==0.4.10
|
| 8 |
-
supabase==1.0.3
|
| 9 |
-
tiktoken==0.4.0
|
| 10 |
-
unstructured==0.6.5
|
| 11 |
-
anthropic==0.2.8
|
| 12 |
-
fastapi==0.95.2
|
| 13 |
-
python-multipart==0.0.6
|
| 14 |
-
uvicorn==0.22.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/sidebar.py
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
def sidebar(supabase):
    """Render the sidebar panel showing how many documents are stored."""
    st.sidebar.title("Database Information")
    number_of_docs = number_of_documents(supabase)
    st.sidebar.markdown(f"**Docs in DB:** {number_of_docs}")

def number_of_documents(supabase):
    """Return the exact row count of the "documents" table."""
    response = supabase.table("documents").select("id", count="exact").execute()
    return response.count
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/stats.py
DELETED
|
@@ -1,31 +0,0 @@
|
|
| 1 |
-
from datetime import datetime, timedelta
|
| 2 |
-
|
| 3 |
-
# -- Create a table called "stats"
|
| 4 |
-
# create table
|
| 5 |
-
# stats (
|
| 6 |
-
# -- A column called "time" with data type "timestamp"
|
| 7 |
-
# time timestamp,
|
| 8 |
-
# -- A column called "details" with data type "text"
|
| 9 |
-
# chat boolean,
|
| 10 |
-
# embedding boolean,
|
| 11 |
-
# details text,
|
| 12 |
-
# metadata jsonb,
|
| 13 |
-
# -- An "integer" primary key column called "id" that is generated always as identity
|
| 14 |
-
# id integer primary key generated always as identity
|
| 15 |
-
# );
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def get_usage_today(supabase):
    """Return how many rows the "stats" table accumulated in the last 24 hours."""
    cutoff = datetime.now() - timedelta(hours=24)
    response = supabase.table("stats").select("id", count="exact").gte("time", cutoff).execute()
    return response.count
|
| 22 |
-
|
| 23 |
-
def add_usage(supabase, type, details, metadata):
    """Insert one usage event into the "stats" table.

    *type* flags the row as a "chat" or "embedding" event; *details* and
    *metadata* are stored verbatim alongside the current timestamp.
    """
    row = {
        "time": datetime.now().isoformat(),
        "chat": type == "chat",
        "embedding": type == "embedding",
        "details": details,
        "metadata": metadata,
    }
    supabase.table("stats").insert(row).execute()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit-demo/utils.py
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
import hashlib


def compute_sha1_from_file(file_path):
    """Return the SHA-1 hex digest of the file at *file_path*."""
    with open(file_path, "rb") as fh:
        payload = fh.read()
    return compute_sha1_from_content(payload)


def compute_sha1_from_content(content):
    """Return the SHA-1 hex digest of *content* (a bytes-like object)."""
    return hashlib.sha1(content).hexdigest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|