RKP64 commited on
Commit
2aeaebd
·
1 Parent(s): 0b68d75

Delete streamlit-demo

Browse files
Files changed (40) hide show
  1. streamlit-demo/.streamlit/secrets.toml +0 -6
  2. streamlit-demo/Dockerfile +0 -25
  3. streamlit-demo/README.md +0 -174
  4. streamlit-demo/__pycache__/brain.cpython-310.pyc +0 -0
  5. streamlit-demo/__pycache__/components_keys.cpython-310.pyc +0 -0
  6. streamlit-demo/__pycache__/explorer.cpython-310.pyc +0 -0
  7. streamlit-demo/__pycache__/files.cpython-310.pyc +0 -0
  8. streamlit-demo/__pycache__/question.cpython-310.pyc +0 -0
  9. streamlit-demo/__pycache__/stats.cpython-310.pyc +0 -0
  10. streamlit-demo/__pycache__/utils.cpython-310.pyc +0 -0
  11. streamlit-demo/app.py +0 -123
  12. streamlit-demo/brain.py +0 -39
  13. streamlit-demo/components_keys.py +0 -4
  14. streamlit-demo/explorer.py +0 -12
  15. streamlit-demo/files.py +0 -191
  16. streamlit-demo/loaders/__init__.py +0 -0
  17. streamlit-demo/loaders/__pycache__/__init__.cpython-310.pyc +0 -0
  18. streamlit-demo/loaders/__pycache__/audio.cpython-310.pyc +0 -0
  19. streamlit-demo/loaders/__pycache__/common.cpython-310.pyc +0 -0
  20. streamlit-demo/loaders/__pycache__/csv.cpython-310.pyc +0 -0
  21. streamlit-demo/loaders/__pycache__/docx.cpython-310.pyc +0 -0
  22. streamlit-demo/loaders/__pycache__/html.cpython-310.pyc +0 -0
  23. streamlit-demo/loaders/__pycache__/markdown.cpython-310.pyc +0 -0
  24. streamlit-demo/loaders/__pycache__/pdf.cpython-310.pyc +0 -0
  25. streamlit-demo/loaders/__pycache__/powerpoint.cpython-310.pyc +0 -0
  26. streamlit-demo/loaders/__pycache__/txt.cpython-310.pyc +0 -0
  27. streamlit-demo/loaders/audio.py +0 -65
  28. streamlit-demo/loaders/common.py +0 -42
  29. streamlit-demo/loaders/csv.py +0 -5
  30. streamlit-demo/loaders/docx.py +0 -5
  31. streamlit-demo/loaders/html.py +0 -47
  32. streamlit-demo/loaders/markdown.py +0 -5
  33. streamlit-demo/loaders/pdf.py +0 -6
  34. streamlit-demo/loaders/powerpoint.py +0 -5
  35. streamlit-demo/loaders/txt.py +0 -5
  36. streamlit-demo/question.py +0 -81
  37. streamlit-demo/requirements.txt +0 -14
  38. streamlit-demo/sidebar.py +0 -11
  39. streamlit-demo/stats.py +0 -31
  40. streamlit-demo/utils.py +0 -11
streamlit-demo/.streamlit/secrets.toml DELETED
@@ -1,6 +0,0 @@
1
- supabase_url = "https://qlvpvyrbyynccpqyljoc.supabase.co"
2
- supabase_service_key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InFsdnB2eXJieXluY2NwcXlsam9jIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTY4NDkxODY4NywiZXhwIjoyMDAwNDk0Njg3fQ.hTDr6FydSOdl0kyFzTiS6mEmkuYXugAAJy_R7eIQIl8"
3
- openai_api_key = "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"
4
- anthropic_api_key = ""
5
- self_hosted = "true"
6
- usage_limit = 2000
 
 
 
 
 
 
 
streamlit-demo/Dockerfile DELETED
@@ -1,25 +0,0 @@
1
- # app/Dockerfile
2
- FROM python:3.11-slim
3
-
4
- WORKDIR /app
5
-
6
- RUN apt-get update && apt-get install -y \
7
- build-essential \
8
- curl \
9
- software-properties-common \
10
- git \
11
- && rm -rf /var/lib/apt/lists/*
12
-
13
- COPY . /app
14
-
15
- ## Mount .streamlit folder to load config.toml and secrets.toml
16
-
17
- RUN pip3 install -r requirements.txt
18
-
19
- EXPOSE 8501
20
-
21
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
22
-
23
- VOLUME [ "/root/.streamlit" ]
24
-
25
- ENTRYPOINT ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/README.md DELETED
@@ -1,174 +0,0 @@
1
- # Quivr
2
-
3
- <p align="center">
4
- <img src="../logo.png" alt="Quivr-logo" width="30%">
5
- <p align="center">
6
-
7
- <a href="https://discord.gg/HUpRgp2HG8">
8
- <img src="https://img.shields.io/badge/discord-join%20chat-blue.svg" alt="Join our Discord" height="40">
9
- </a>
10
-
11
- Quivr is your second brain in the cloud, designed to easily store and retrieve unstructured information. It's like Obsidian but powered by generative AI.
12
-
13
- ## Features
14
-
15
- - **Store Anything**: Quivr can handle almost any type of data you throw at it. Text, images, code snippets, you name it.
16
- - **Generative AI**: Quivr uses advanced AI to help you generate and retrieve information.
17
- - **Fast and Efficient**: Designed with speed and efficiency in mind. Quivr makes sure you can access your data as quickly as possible.
18
- - **Secure**: Your data is stored securely in the cloud and is always under your control.
19
- - **Compatible Files**:
20
- - **Text**
21
- - **Markdown**
22
- - **PDF**
23
- - **Audio**
24
- - **Video**
25
- - **Open Source**: Quivr is open source and free to use.
26
- ## Demo
27
-
28
-
29
- ### Demo with GPT3.5
30
- https://github.com/StanGirard/quivr/assets/19614572/80721777-2313-468f-b75e-09379f694653
31
-
32
-
33
- ### Demo with Claude 100k context
34
- https://github.com/StanGirard/quivr/assets/5101573/9dba918c-9032-4c8d-9eea-94336d2c8bd4
35
-
36
- ## Getting Started
37
-
38
- These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
39
-
40
- ### Prerequisites
41
-
42
- Make sure you have the following installed before continuing:
43
-
44
- - Python 3.10 or higher
45
- - Pip
46
- - Virtualenv
47
-
48
- You'll also need a [Supabase](https://supabase.com/) account for:
49
-
50
- - A new Supabase project
51
- - Supabase Project API key
52
- - Supabase Project URL
53
-
54
- ### Installing
55
-
56
- - Clone the repository
57
-
58
- ```bash
59
- git clone git@github.com:StanGirard/Quivr.git && cd Quivr
60
- ```
61
-
62
- - Create a virtual environment
63
-
64
- ```bash
65
- virtualenv venv
66
- ```
67
-
68
- - Activate the virtual environment
69
-
70
- ```bash
71
- source venv/bin/activate
72
- ```
73
-
74
- - Install the dependencies
75
-
76
- ```bash
77
- pip install -r requirements.txt
78
- ```
79
-
80
- - Copy the streamlit secrets.toml example file
81
-
82
- ```bash
83
- cp .streamlit/secrets.toml.example .streamlit/secrets.toml
84
- ```
85
-
86
- - Add your credentials to .streamlit/secrets.toml file
87
-
88
- ```toml
89
- supabase_url = "SUPABASE_URL"
90
- supabase_service_key = "SUPABASE_SERVICE_KEY"
91
- openai_api_key = "OPENAI_API_KEY"
92
- anthropic_api_key = "ANTHROPIC_API_KEY" # Optional
93
- ```
94
-
95
- _Note that the `supabase_service_key` is found in your Supabase dashboard under Project Settings -> API. Use the `anon` `public` key found in the `Project API keys` section._
96
-
97
- - Run the following migration scripts on the Supabase database via the web interface (SQL Editor -> `New query`)
98
-
99
- ```sql
100
- -- Enable the pgvector extension to work with embedding vectors
101
- create extension vector;
102
-
103
- -- Create a table to store your documents
104
- create table documents (
105
- id bigserial primary key,
106
- content text, -- corresponds to Document.pageContent
107
- metadata jsonb, -- corresponds to Document.metadata
108
- embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed
109
- );
110
-
111
- CREATE FUNCTION match_documents(query_embedding vector(1536), match_count int)
112
- RETURNS TABLE(
113
- id bigint,
114
- content text,
115
- metadata jsonb,
116
- -- we return matched vectors to enable maximal marginal relevance searches
117
- embedding vector(1536),
118
- similarity float)
119
- LANGUAGE plpgsql
120
- AS $$
121
- # variable_conflict use_column
122
- BEGIN
123
- RETURN query
124
- SELECT
125
- id,
126
- content,
127
- metadata,
128
- embedding,
129
- 1 -(documents.embedding <=> query_embedding) AS similarity
130
- FROM
131
- documents
132
- ORDER BY
133
- documents.embedding <=> query_embedding
134
- LIMIT match_count;
135
- END;
136
- $$;
137
- ```
138
-
139
- and
140
-
141
- ```sql
142
- create table
143
- stats (
144
- -- A column called "time" with data type "timestamp"
145
- time timestamp,
146
- -- A column called "details" with data type "text"
147
- chat boolean,
148
- embedding boolean,
149
- details text,
150
- metadata jsonb,
151
- -- An "integer" primary key column called "id" that is generated always as identity
152
- id integer primary key generated always as identity
153
- );
154
- ```
155
-
156
- - Run the app
157
-
158
- ```bash
159
- streamlit run main.py
160
- ```
161
-
162
- ## Built With
163
-
164
- * [NextJS](https://nextjs.org/) - The React framework used.
165
- * [FastAPI](https://fastapi.tiangolo.com/) - The API framework used.
166
- * [Supabase](https://supabase.io/) - The open source Firebase alternative.
167
-
168
- ## Contributing
169
-
170
- Open a pull request and we'll review it as soon as possible.
171
-
172
- ## Star History
173
-
174
- [![Star History Chart](https://api.star-history.com/svg?repos=StanGirard/quivr&type=Date)](https://star-history.com/#StanGirard/quivr&Date)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/__pycache__/brain.cpython-310.pyc DELETED
Binary file (1.82 kB)
 
streamlit-demo/__pycache__/components_keys.cpython-310.pyc DELETED
Binary file (386 Bytes)
 
streamlit-demo/__pycache__/explorer.cpython-310.pyc DELETED
Binary file (516 Bytes)
 
streamlit-demo/__pycache__/files.cpython-310.pyc DELETED
Binary file (5.24 kB)
 
streamlit-demo/__pycache__/question.cpython-310.pyc DELETED
Binary file (2.56 kB)
 
streamlit-demo/__pycache__/stats.cpython-310.pyc DELETED
Binary file (736 Bytes)
 
streamlit-demo/__pycache__/utils.cpython-310.pyc DELETED
Binary file (565 Bytes)
 
streamlit-demo/app.py DELETED
@@ -1,123 +0,0 @@
1
- # main.py
2
- import os
3
- import tempfile
4
-
5
- import streamlit as st
6
- from files import file_uploader, url_uploader
7
- from question import chat_with_doc
8
- from brain import brain
9
- from langchain.embeddings.openai import OpenAIEmbeddings
10
- from langchain.vectorstores import SupabaseVectorStore
11
- from supabase import Client, create_client
12
- from explorer import view_document
13
- from stats import get_usage_today
14
-
15
- supabase_url = st.secrets.supabase_url
16
- supabase_key = st.secrets.supabase_service_key
17
- openai_api_key = st.secrets.openai_api_key
18
- anthropic_api_key = st.secrets.anthropic_api_key
19
- supabase: Client = create_client(supabase_url, supabase_key)
20
- self_hosted = st.secrets.self_hosted
21
-
22
- embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
23
- vector_store = SupabaseVectorStore(
24
- supabase, embeddings, table_name="documents")
25
- models = ["gpt-3.5-turbo", "gpt-4"]
26
- if anthropic_api_key:
27
- models += ["claude-v1", "claude-v1.3",
28
- "claude-instant-v1-100k", "claude-instant-v1.1-100k"]
29
-
30
- # Set the theme
31
- st.set_page_config(
32
- page_title="KPMG GPT",
33
- layout="wide",
34
- initial_sidebar_state="expanded",
35
- )
36
-
37
-
38
- st.title("KPMG GPT")
39
- st.markdown("")
40
- if self_hosted == "false":
41
- st.markdown('**📢 Note: In the public demo, access to functionality is restricted. You can only use the GPT-3.5-turbo model and upload files up to 1Mb. To use more models and upload larger files, consider self-hosting Quivr.**')
42
-
43
- st.markdown("---\n\n")
44
-
45
- st.session_state["overused"] = False
46
- if self_hosted == "false":
47
- usage = get_usage_today(supabase)
48
- if usage > st.secrets.usage_limit:
49
- st.markdown(
50
- f"<span style='color:red'>You have used {usage} tokens today, which is more than your daily limit of {st.secrets.usage_limit} tokens. Please come back later or consider self-hosting.</span>", unsafe_allow_html=True)
51
- st.session_state["overused"] = True
52
- else:
53
- st.markdown(f"<span style='color:blue'>Usage today: {usage} tokens out of {st.secrets.usage_limit}</span>", unsafe_allow_html=True)
54
- st.write("---")
55
-
56
-
57
-
58
-
59
- # Initialize session state variables
60
- if 'model' not in st.session_state:
61
- st.session_state['model'] = "gpt-3.5-turbo"
62
- if 'temperature' not in st.session_state:
63
- st.session_state['temperature'] = 0.0
64
- if 'chunk_size' not in st.session_state:
65
- st.session_state['chunk_size'] = 500
66
- if 'chunk_overlap' not in st.session_state:
67
- st.session_state['chunk_overlap'] = 0
68
- if 'max_tokens' not in st.session_state:
69
- st.session_state['max_tokens'] = 256
70
-
71
- # Create a radio button for user to choose between adding knowledge or asking a question
72
- user_choice = st.radio(
73
- "Choose an action", ('Add Knowledge', 'Chat with your Brain', 'Forget', "Explore"))
74
-
75
- st.markdown("---\n\n")
76
-
77
- if user_choice == 'Add Knowledge':
78
- # Display chunk size and overlap selection only when adding knowledge
79
- st.sidebar.title("Configuration")
80
- st.sidebar.markdown(
81
- "Choose your chunk size and overlap for adding knowledge.")
82
- st.session_state['chunk_size'] = st.sidebar.slider(
83
- "Select Chunk Size", 100, 2000, st.session_state['chunk_size'], 50)
84
- st.session_state['chunk_overlap'] = st.sidebar.slider(
85
- "Select Chunk Overlap", 0, 200, st.session_state['chunk_overlap'], 10)
86
-
87
- # Create two columns for the file uploader and URL uploader
88
- col1, col2 = st.columns(2)
89
-
90
- with col1:
91
- file_uploader(supabase, vector_store)
92
- with col2:
93
- url_uploader(supabase, vector_store)
94
- elif user_choice == 'Chat with your Brain':
95
- # Display model and temperature selection only when asking questions
96
- st.sidebar.title("Configuration")
97
- st.sidebar.markdown(
98
- "Choose your model and temperature for asking questions.")
99
- if self_hosted != "false":
100
- st.session_state['model'] = st.sidebar.selectbox(
101
- "Select Model", models, index=(models).index(st.session_state['model']))
102
- else:
103
- st.sidebar.write("**Model**: gpt-3.5-turbo")
104
- st.sidebar.write("**Self Host to unlock more models such as claude-v1 and GPT4**")
105
- st.session_state['model'] = "gpt-3.5-turbo"
106
- st.session_state['temperature'] = st.sidebar.slider(
107
- "Select Temperature", 0.0, 1.0, st.session_state['temperature'], 0.1)
108
- if st.secrets.self_hosted != "false":
109
- st.session_state['max_tokens'] = st.sidebar.slider(
110
- "Select Max Tokens", 256, 2048, st.session_state['max_tokens'], 2048)
111
- else:
112
- st.session_state['max_tokens'] = 500
113
-
114
- chat_with_doc(st.session_state['model'], vector_store, stats_db=supabase)
115
- elif user_choice == 'Forget':
116
- st.sidebar.title("Configuration")
117
-
118
- brain(supabase)
119
- elif user_choice == 'Explore':
120
- st.sidebar.title("Configuration")
121
- view_document(supabase)
122
-
123
- st.markdown("---\n\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/brain.py DELETED
@@ -1,39 +0,0 @@
1
- import streamlit as st
2
- import numpy as np
3
-
4
- def brain(supabase):
5
- ## List all documents
6
- response = supabase.table("documents").select("name:metadata->>file_name, size:metadata->>file_size", count="exact").execute()
7
-
8
- documents = response.data # Access the data from the response
9
-
10
- # Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
11
- unique_data = [dict(t) for t in set(tuple(d.items()) for d in documents)]
12
-
13
- # Sort the list of documents by size in decreasing order
14
- unique_data.sort(key=lambda x: int(x['size']), reverse=True)
15
-
16
- # Display some metrics at the top of the page
17
- col1, col2 = st.columns(2)
18
- col1.metric(label="Total Documents", value=len(unique_data))
19
- col2.metric(label="Total Size (bytes)", value=sum(int(doc['size']) for doc in unique_data))
20
-
21
- for document in unique_data:
22
- # Create a unique key for each button by using the document name
23
- button_key = f"delete_{document['name']}"
24
-
25
- # Display the document name, size and the delete button on the same line
26
- col1, col2, col3 = st.columns([3, 1, 1])
27
- col1.markdown(f"**{document['name']}** ({document['size']} bytes)")
28
-
29
- if col2.button('❌', key=button_key):
30
- delete_document(supabase, document['name'])
31
-
32
- def delete_document(supabase, document_name):
33
- # Delete the document from the database
34
- response = supabase.table("documents").delete().match({"metadata->>file_name": document_name}).execute()
35
- # Check if the deletion was successful
36
- if len(response.data) > 0:
37
- st.write(f"✂️ {document_name} was deleted.")
38
- else:
39
- st.write(f"❌ {document_name} was not deleted.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/components_keys.py DELETED
@@ -1,4 +0,0 @@
1
- """Store streamlit component keys"""
2
-
3
- class ComponentsKeys:
4
- FILE_UPLOADER = "file_uploader"
 
 
 
 
 
streamlit-demo/explorer.py DELETED
@@ -1,12 +0,0 @@
1
- import streamlit as st
2
-
3
-
4
- def view_document(supabase):
5
- # Get the document from the database
6
- response = supabase.table("documents").select("content").execute()
7
- st.write("**This feature is in active development**")
8
- # Display a list of elements from the documents
9
- # If the user clicks on an element, display the content of the document
10
- for document in response.data:
11
- if st.button(document['content'][:50].replace("\n", " ")):
12
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/files.py DELETED
@@ -1,191 +0,0 @@
1
- import os
2
- from typing import (
3
- Any,
4
- Union,
5
- )
6
- import zipfile
7
- import streamlit as st
8
- from streamlit.runtime.uploaded_file_manager import (
9
- UploadedFile,
10
- UploadedFileRec,
11
- UploadedFileManager,
12
- )
13
- from streamlit.runtime.scriptrunner import get_script_run_ctx
14
- from supabase.client import Client
15
- from langchain.vectorstores.supabase import SupabaseVectorStore
16
- from components_keys import ComponentsKeys
17
- from loaders.audio import process_audio
18
- from loaders.txt import process_txt
19
- from loaders.csv import process_csv
20
- from loaders.markdown import process_markdown
21
- from loaders.pdf import process_pdf
22
- from loaders.html import (
23
- create_html_file,
24
- delete_tempfile,
25
- get_html,
26
- process_html,
27
- )
28
- from loaders.powerpoint import process_powerpoint
29
- from loaders.docx import process_docx
30
- from utils import compute_sha1_from_content
31
-
32
-
33
- ctx = get_script_run_ctx()
34
- manager = UploadedFileManager()
35
- file_processors = {
36
- ".txt": process_txt,
37
- ".csv": process_csv,
38
- ".md": process_markdown,
39
- ".markdown": process_markdown,
40
- ".m4a": process_audio,
41
- ".mp3": process_audio,
42
- ".webm": process_audio,
43
- ".mp4": process_audio,
44
- ".mpga": process_audio,
45
- ".wav": process_audio,
46
- ".mpeg": process_audio,
47
- ".pdf": process_pdf,
48
- ".html": process_html,
49
- ".pptx": process_powerpoint,
50
- ".docx": process_docx
51
- }
52
-
53
- def file_uploader(supabase, vector_store):
54
- # Omit zip file support if the `st.secrets.self_hosted` != "true" because
55
- # a zip file can consist of multiple files so the limit on 1 file uploaded
56
- # at a time in the demo can be circumvented.
57
- accepted_file_extensions = list(file_processors.keys())
58
- accept_multiple_files = st.secrets.self_hosted == "true"
59
- if accept_multiple_files:
60
- accepted_file_extensions += [".zip"]
61
-
62
- files = st.file_uploader(
63
- "**Upload a file**",
64
- accept_multiple_files=accept_multiple_files,
65
- type=accepted_file_extensions,
66
- key=ComponentsKeys.FILE_UPLOADER,
67
- )
68
- if st.secrets.self_hosted == "false":
69
- st.markdown("**In demo mode, the max file size is 1MB**")
70
- if st.button("Add to Database"):
71
- # Single file upload
72
- if isinstance(files, UploadedFile):
73
- filter_file(files, supabase, vector_store)
74
- # Multiple files upload
75
- elif isinstance(files, list):
76
- for file in files:
77
- filter_file(file, supabase, vector_store)
78
-
79
- def file_already_exists(supabase, file):
80
- file_sha1 = compute_sha1_from_content(file.getvalue())
81
- response = supabase.table("documents").select("id").eq("metadata->>file_sha1", file_sha1).execute()
82
- return len(response.data) > 0
83
-
84
- def file_to_uploaded_file(file: Any) -> Union[None, UploadedFile]:
85
- """Convert a file to a streamlit `UploadedFile` object.
86
-
87
- This allows us to unzip files and treat them the same way
88
- streamlit treats files uploaded through the file uploader.
89
-
90
- Parameters
91
- ---------
92
- file : Any
93
- The file. Can be any file supported by this app.
94
-
95
- Returns
96
- -------
97
- Union[None, UploadedFile]
98
- The file converted to a streamlit `UploadedFile` object.
99
- Returns `None` if the script context cannot be grabbed.
100
- """
101
-
102
- if ctx is None:
103
- print("script context not found, skipping uploading file:", file.name)
104
- return
105
-
106
- file_extension = os.path.splitext(file.name)[-1]
107
- file_name = file.name
108
- file_data = file.read()
109
- # The file manager will automatically assign an ID so pass `None`
110
- # Reference: https://github.com/streamlit/streamlit/blob/9a6ce804b7977bdc1f18906d1672c45f9a9b3398/lib/streamlit/runtime/uploaded_file_manager.py#LL98C6-L98C6
111
- uploaded_file_rec = UploadedFileRec(None, file_name, file_extension, file_data)
112
- uploaded_file_rec = manager.add_file(
113
- ctx.session_id,
114
- ComponentsKeys.FILE_UPLOADER,
115
- uploaded_file_rec,
116
- )
117
- return UploadedFile(uploaded_file_rec)
118
-
119
- def filter_zip_file(
120
- file: UploadedFile,
121
- supabase: Client,
122
- vector_store: SupabaseVectorStore,
123
- ) -> None:
124
- """Unzip the zip file then filter each unzipped file.
125
-
126
- Parameters
127
- ----------
128
- file : UploadedFile
129
- The uploaded file from the file uploader.
130
- supabase : Client
131
- The supabase client.
132
- vector_store : SupabaseVectorStore
133
- The vector store in the database.
134
- """
135
-
136
- with zipfile.ZipFile(file, "r") as z:
137
- unzipped_files = z.namelist()
138
- for unzipped_file in unzipped_files:
139
- with z.open(unzipped_file, "r") as f:
140
- filter_file(f, supabase, vector_store)
141
-
142
- def filter_file(file, supabase, vector_store):
143
- # Streamlit file uploads are of type `UploadedFile` which has the
144
- # necessary methods and attributes for this app to work.
145
- if not isinstance(file, UploadedFile):
146
- file = file_to_uploaded_file(file)
147
-
148
- file_extension = os.path.splitext(file.name)[-1]
149
- if file_extension == ".zip":
150
- filter_zip_file(file, supabase, vector_store)
151
- return True
152
-
153
- if file_already_exists(supabase, file):
154
- st.write(f"😎 {file.name} is already in the database.")
155
- return False
156
-
157
- if file.size < 1:
158
- st.write(f"💨 {file.name} is empty.")
159
- return False
160
-
161
- if file_extension in file_processors:
162
- if st.secrets.self_hosted == "false":
163
- file_processors[file_extension](vector_store, file, stats_db=supabase)
164
- else:
165
- file_processors[file_extension](vector_store, file, stats_db=None)
166
- st.write(f"✅ {file.name} ")
167
- return True
168
-
169
- st.write(f"❌ {file.name} is not a valid file type.")
170
- return False
171
-
172
- def url_uploader(supabase, vector_store):
173
- url = st.text_area("**Add an url**",placeholder="")
174
- button = st.button("Add the URL to the database")
175
-
176
- if button:
177
- if not st.session_state["overused"]:
178
- html = get_html(url)
179
- if html:
180
- st.write(f"Getting content ... {url} ")
181
- try:
182
- file, temp_file_path = create_html_file(url, html)
183
- except UnicodeEncodeError as e:
184
- st.write(f"❌ Error encoding character: {e}")
185
- file, temp_file_path = create_html_file(url, html)
186
- ret = filter_file(file, supabase, vector_store)
187
- delete_tempfile(temp_file_path, url, ret)
188
- else:
189
- st.write(f"❌ Failed to access to {url} .")
190
- else:
191
- st.write("You have reached your daily limit. Please come back later or self host the solution.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/loaders/__init__.py DELETED
File without changes
streamlit-demo/loaders/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (148 Bytes)
 
streamlit-demo/loaders/__pycache__/audio.cpython-310.pyc DELETED
Binary file (2.39 kB)
 
streamlit-demo/loaders/__pycache__/common.cpython-310.pyc DELETED
Binary file (1.7 kB)
 
streamlit-demo/loaders/__pycache__/csv.cpython-310.pyc DELETED
Binary file (429 Bytes)
 
streamlit-demo/loaders/__pycache__/docx.cpython-310.pyc DELETED
Binary file (426 Bytes)
 
streamlit-demo/loaders/__pycache__/html.cpython-310.pyc DELETED
Binary file (1.97 kB)
 
streamlit-demo/loaders/__pycache__/markdown.cpython-310.pyc DELETED
Binary file (444 Bytes)
 
streamlit-demo/loaders/__pycache__/pdf.cpython-310.pyc DELETED
Binary file (420 Bytes)
 
streamlit-demo/loaders/__pycache__/powerpoint.cpython-310.pyc DELETED
Binary file (452 Bytes)
 
streamlit-demo/loaders/__pycache__/txt.cpython-310.pyc DELETED
Binary file (419 Bytes)
 
streamlit-demo/loaders/audio.py DELETED
@@ -1,65 +0,0 @@
1
- import os
2
- import tempfile
3
- from io import BytesIO
4
- import time
5
- import openai
6
- import streamlit as st
7
- from langchain.document_loaders import TextLoader
8
- from langchain.embeddings.openai import OpenAIEmbeddings
9
- from langchain.text_splitter import RecursiveCharacterTextSplitter
10
- from utils import compute_sha1_from_content
11
- from langchain.schema import Document
12
- from stats import add_usage
13
-
14
-
15
-
16
- # Create a function to transcribe audio using Whisper
17
- def _transcribe_audio(api_key, audio_file, stats_db):
18
- openai.api_key = api_key
19
- transcript = ""
20
-
21
- with BytesIO(audio_file.read()) as audio_bytes:
22
- # Get the extension of the uploaded file
23
- file_extension = os.path.splitext(audio_file.name)[-1]
24
-
25
- # Create a temporary file with the uploaded audio data and the correct extension
26
- with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_audio_file:
27
- temp_audio_file.write(audio_bytes.read())
28
- temp_audio_file.seek(0) # Move the file pointer to the beginning of the file
29
-
30
- # Transcribe the temporary audio file
31
- if st.secrets.self_hosted == "false":
32
- add_usage(stats_db, "embedding", "audio", metadata={"file_name": audio_file.name,"file_type": file_extension})
33
-
34
- transcript = openai.Audio.translate("whisper-1", temp_audio_file)
35
-
36
- return transcript
37
-
38
- def process_audio(vector_store, file_name, stats_db):
39
- if st.secrets.self_hosted == "false":
40
- if file_name.size > 10000000:
41
- st.error("File size is too large. Please upload a file smaller than 1MB.")
42
- return
43
- file_sha = ""
44
- dateshort = time.strftime("%Y%m%d-%H%M%S")
45
- file_meta_name = f"audiotranscript_{dateshort}.txt"
46
- openai_api_key = st.secrets["openai_api_key"]
47
- transcript = _transcribe_audio(openai_api_key, file_name, stats_db)
48
- file_sha = compute_sha1_from_content(transcript.text.encode("utf-8"))
49
- ## file size computed from transcript
50
- file_size = len(transcript.text.encode("utf-8"))
51
-
52
-
53
- ## Load chunk size and overlap from sidebar
54
- chunk_size = st.session_state['chunk_size']
55
- chunk_overlap = st.session_state['chunk_overlap']
56
-
57
- text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
58
- texts = text_splitter.split_text(transcript.text)
59
-
60
- docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha,"file_size": file_size, "file_name": file_meta_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for text in texts]
61
-
62
- if st.secrets.self_hosted == "false":
63
- add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
64
- vector_store.add_documents(docs_with_metadata)
65
- return vector_store
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/loaders/common.py DELETED
@@ -1,42 +0,0 @@
1
- import tempfile
2
- import time
3
- import os
4
- from utils import compute_sha1_from_file
5
- from langchain.schema import Document
6
- import streamlit as st
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
- from stats import add_usage
9
-
10
- def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
11
- documents = []
12
- file_name = file.name
13
- file_size = file.size
14
- if st.secrets.self_hosted == "false":
15
- if file_size > 1000000:
16
- st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
17
- return
18
-
19
- dateshort = time.strftime("%Y%m%d")
20
- with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
21
- tmp_file.write(file.getvalue())
22
- tmp_file.flush()
23
-
24
- loader = loader_class(tmp_file.name)
25
- documents = loader.load()
26
- file_sha1 = compute_sha1_from_file(tmp_file.name)
27
-
28
- os.remove(tmp_file.name)
29
-
30
- chunk_size = st.session_state['chunk_size']
31
- chunk_overlap = st.session_state['chunk_overlap']
32
-
33
- text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
34
-
35
- documents = text_splitter.split_documents(documents)
36
-
37
- # Add the document sha1 as metadata to each document
38
- docs_with_metadata = [Document(page_content=doc.page_content, metadata={"file_sha1": file_sha1,"file_size":file_size ,"file_name": file_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for doc in documents]
39
-
40
- vector_store.add_documents(docs_with_metadata)
41
- if stats_db:
42
- add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name,"file_type": file_suffix, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/loaders/csv.py DELETED
@@ -1,5 +0,0 @@
1
- from .common import process_file
2
- from langchain.document_loaders.csv_loader import CSVLoader
3
-
4
- def process_csv(vector_store, file,stats_db):
5
- return process_file(vector_store, file, CSVLoader, ".csv",stats_db=stats_db)
 
 
 
 
 
 
streamlit-demo/loaders/docx.py DELETED
@@ -1,5 +0,0 @@
1
- from .common import process_file
2
- from langchain.document_loaders import Docx2txtLoader
3
-
4
- def process_docx(vector_store, file, stats_db):
5
- return process_file(vector_store, file, Docx2txtLoader, ".docx", stats_db=stats_db)
 
 
 
 
 
 
streamlit-demo/loaders/html.py DELETED
@@ -1,47 +0,0 @@
1
- from .common import process_file
2
- from langchain.document_loaders import UnstructuredHTMLLoader
3
- import requests
4
- import re
5
- import unicodedata
6
- import tempfile
7
- import os
8
- import streamlit as st
9
- from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile
10
-
11
def process_html(vector_store, file, stats_db):
    """Ingest an uploaded HTML file into the vector store.

    Delegates to the shared process_file pipeline using UnstructuredHTMLLoader.
    """
    return process_file(
        vector_store,
        file,
        UnstructuredHTMLLoader,
        ".html",
        stats_db=stats_db,
    )
13
-
14
-
15
def get_html(url):
    """Fetch *url* and return the response body as text.

    Returns None for any non-200 status. A request timeout is set so a
    hung or unreachable server cannot block the Streamlit script run
    indefinitely (the original call had no timeout and could hang forever).
    """
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        return response.text
    # Non-200 (redirect chains resolve inside requests; 4xx/5xx land here).
    return None
21
-
22
def create_html_file(url, content):
    """Write *content* to a temp .html file and wrap it as a Streamlit UploadedFile.

    The file name is derived from the slugified URL. Returns a tuple of
    (uploaded_file, temp_file_path); the caller is responsible for removing
    the temp file afterwards (see delete_tempfile).
    """
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
    # Force utf-8 so the write does not depend on the platform default encoding.
    with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
        temp_file.write(content)

    # Re-read through a context manager -- the original used a bare
    # open(...).read() inside the constructor call and leaked the file handle.
    with open(temp_file_path, 'rb') as saved:
        data = saved.read()
    record = UploadedFileRec(id=None, name=file_name, type='text/html', data=data)
    uploaded_file = UploadedFile(record)

    return uploaded_file, temp_file_path
32
-
33
def delete_tempfile(temp_file_path, url, ret):
    """Remove the temporary HTML file for *url*.

    When *ret* is truthy, a success or failure notice is written to the
    Streamlit page; deletion failures are also printed to stdout.
    """
    try:
        os.remove(temp_file_path)
    except OSError as exc:
        print(f"Error while deleting the temporary file: {str(exc)}")
        if ret:
            st.write(f"❌ Error while saving content... {url} ")
    else:
        if ret:
            st.write(f"✅ Content saved... {url} ")
42
-
43
def slugify(text):
    """Reduce arbitrary text to a lowercase, ascii, hyphen-separated slug.

    Accented characters are decomposed and stripped to ascii, punctuation is
    dropped, and runs of whitespace/hyphens collapse to a single hyphen.
    """
    ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text)
    cleaned = cleaned.strip().lower()
    return re.sub(r'[-\s]+', '-', cleaned)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/loaders/markdown.py DELETED
@@ -1,5 +0,0 @@
1
- from .common import process_file
2
- from langchain.document_loaders import UnstructuredMarkdownLoader
3
-
4
def process_markdown(vector_store, file, stats_db):
    """Ingest an uploaded Markdown file into the vector store.

    Delegates to the shared process_file pipeline using UnstructuredMarkdownLoader.
    """
    return process_file(
        vector_store,
        file,
        UnstructuredMarkdownLoader,
        ".md",
        stats_db=stats_db,
    )
 
 
 
 
 
 
streamlit-demo/loaders/pdf.py DELETED
@@ -1,6 +0,0 @@
1
- from .common import process_file
2
- from langchain.document_loaders import PyPDFLoader
3
-
4
-
5
def process_pdf(vector_store, file, stats_db):
    """Ingest an uploaded PDF file into the vector store.

    Delegates to the shared process_file pipeline using PyPDFLoader.
    """
    return process_file(
        vector_store,
        file,
        PyPDFLoader,
        ".pdf",
        stats_db=stats_db,
    )
 
 
 
 
 
 
 
streamlit-demo/loaders/powerpoint.py DELETED
@@ -1,5 +0,0 @@
1
- from .common import process_file
2
- from langchain.document_loaders import UnstructuredPowerPointLoader
3
-
4
def process_powerpoint(vector_store, file, stats_db):
    """Ingest an uploaded PowerPoint file into the vector store.

    Delegates to the shared process_file pipeline using UnstructuredPowerPointLoader.
    """
    return process_file(
        vector_store,
        file,
        UnstructuredPowerPointLoader,
        ".pptx",
        stats_db=stats_db,
    )
 
 
 
 
 
 
streamlit-demo/loaders/txt.py DELETED
@@ -1,5 +0,0 @@
1
- from .common import process_file
2
- from langchain.document_loaders import TextLoader
3
-
4
def process_txt(vector_store, file, stats_db):
    """Ingest an uploaded plain-text file into the vector store.

    Delegates to the shared process_file pipeline using TextLoader.
    """
    return process_file(
        vector_store,
        file,
        TextLoader,
        ".txt",
        stats_db=stats_db,
    )
 
 
 
 
 
 
streamlit-demo/question.py DELETED
@@ -1,81 +0,0 @@
1
- import anthropic
2
- import streamlit as st
3
- from streamlit.logger import get_logger
4
- from langchain.chains import ConversationalRetrievalChain
5
- from langchain.memory import ConversationBufferMemory
6
- from langchain.llms import OpenAI
7
- from langchain.chat_models import ChatAnthropic
8
- from langchain.vectorstores import SupabaseVectorStore
9
- from stats import add_usage
10
-
11
# Module-level conversation memory shared by every chain built in
# chat_with_doc; cleared explicitly via the "Clear History" button.
memory = ConversationBufferMemory(
    memory_key="chat_history", return_messages=True)
# API keys are read from Streamlit secrets (.streamlit/secrets.toml);
# anthropic_api_key may be absent/falsy, which disables the Claude branch.
openai_api_key = st.secrets.openai_api_key
anthropic_api_key = st.secrets.anthropic_api_key
logger = get_logger(__name__)
16
-
17
-
18
def count_tokens(question, model):
    """Return a human-readable size summary for *question*.

    Always includes a whitespace-split word count; when a Claude model is
    selected, appends the Anthropic tokenizer's token count as well.
    """
    summary = f'Words: {len(question.split())}'
    if model.startswith("claude"):
        summary = f'{summary} | Tokens: {anthropic.count_tokens(question)}'
    return summary
23
-
24
-
25
def chat_with_doc(model, vector_store: SupabaseVectorStore, stats_db):
    """Render the Q&A interface and answer questions against the vector store.

    Parameters
    ----------
    model : str
        Model identifier; names starting with "gpt" route to OpenAI, names
        starting with "claude" route to Anthropic (when a key is configured).
    vector_store : SupabaseVectorStore
        Document store used as the retriever for the conversational chain.
    stats_db :
        Supabase client passed to add_usage for usage accounting.

    Uses st.session_state keys: chat_history, overused, model, temperature,
    max_tokens. The module-level `memory` keeps LangChain conversation state.
    """
    if 'chat_history' not in st.session_state:
        st.session_state['chat_history'] = []

    question = st.text_area("## Ask a question")
    columns = st.columns(3)
    with columns[0]:
        button = st.button("Ask")
    with columns[1]:
        count_button = st.button("Count Tokens", type='secondary')
    with columns[2]:
        clear_history = st.button("Clear History", type='secondary')

    if clear_history:
        # Clear both the LangChain memory and the displayed transcript.
        memory.clear()
        st.session_state['chat_history'] = []
        st.experimental_rerun()

    if button:
        qa = None
        if not st.session_state["overused"]:
            add_usage(stats_db, "chat", "prompt" + question, {"model": model, "temperature": st.session_state['temperature']})
            if model.startswith("gpt"):
                logger.info('Using OpenAI model %s', model)
                qa = ConversationalRetrievalChain.from_llm(
                    OpenAI(
                        model_name=st.session_state['model'], openai_api_key=openai_api_key, temperature=st.session_state['temperature'], max_tokens=st.session_state['max_tokens']), vector_store.as_retriever(), memory=memory, verbose=True)
            elif anthropic_api_key and model.startswith("claude"):
                logger.info('Using Anthropics model %s', model)
                qa = ConversationalRetrievalChain.from_llm(
                    ChatAnthropic(
                        model=st.session_state['model'], anthropic_api_key=anthropic_api_key, temperature=st.session_state['temperature'], max_tokens_to_sample=st.session_state['max_tokens']), vector_store.as_retriever(), memory=memory, verbose=True, max_tokens_limit=102400)

            # BUG FIX: previously, when the model matched neither provider
            # (or the Anthropic key was missing), qa stayed None and
            # qa({"question": ...}) crashed with a TypeError. Fail gracefully.
            if qa is None:
                st.error("Selected model is not available. Check the model name and API keys.")
                return

            st.session_state['chat_history'].append(("You", question))

            # Generate the model's response and add it to the chat history.
            model_response = qa({"question": question})
            logger.info('Result: %s', model_response)

            st.session_state['chat_history'].append(("KPMG GPT", model_response["answer"]))

            # Display chat history
            st.empty()
            for speaker, text in st.session_state['chat_history']:
                st.markdown(f"**{speaker}:** {text}")
        else:
            st.error("You have used all your free credits. Please try again later or self host.")

    if count_button:
        st.write(count_tokens(question, model))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/requirements.txt DELETED
@@ -1,14 +0,0 @@
1
- langchain==0.0.166
2
- Markdown==3.4.3
3
- openai==0.27.6
4
- pdf2image==1.16.3
5
- pypdf==3.8.1
6
- streamlit==1.22.0
7
- StrEnum==0.4.10
8
- supabase==1.0.3
9
- tiktoken==0.4.0
10
- unstructured==0.6.5
11
- anthropic==0.2.8
12
- fastapi==0.95.2
13
- python-multipart==0.0.6
14
- uvicorn==0.22.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/sidebar.py DELETED
@@ -1,11 +0,0 @@
1
- import streamlit as st
2
-
3
-
4
def sidebar(supabase):
    """Render the app sidebar with a live document count from the database."""
    st.sidebar.title("Database Information")
    doc_count = number_of_documents(supabase)
    st.sidebar.markdown(f"**Docs in DB:** {doc_count}")
8
-
9
def number_of_documents(supabase):
    """Return the exact row count of the "documents" table."""
    response = supabase.table("documents").select("id", count="exact").execute()
    return response.count
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/stats.py DELETED
@@ -1,31 +0,0 @@
1
- from datetime import datetime, timedelta
2
-
3
- # -- Create a table called "stats"
4
- # create table
5
- # stats (
6
- # -- A column called "time" with data type "timestamp"
7
- # time timestamp,
8
- # -- A column called "details" with data type "text"
9
- # chat boolean,
10
- # embedding boolean,
11
- # details text,
12
- # metadata jsonb,
13
- # -- An "integer" primary key column called "id" that is generated always as identity
14
- # id integer primary key generated always as identity
15
- # );
16
-
17
-
18
def get_usage_today(supabase):
    """Return the number of rows in the stats table for the last 24 hours.

    The cutoff timestamp is serialized with isoformat() so the filter value
    sent to the REST client is a plain string -- consistent with how
    add_usage stores the "time" column. (The original passed a raw datetime
    object to .gte(), which is not JSON-serializable.)
    """
    cutoff = (datetime.now() - timedelta(hours=24)).isoformat()
    response = supabase.table("stats").select("id", count="exact").gte("time", cutoff).execute()
    return response.count
22
-
23
def add_usage(supabase, type, details, metadata):
    """Insert one usage event into the "stats" table.

    The event *type* ("chat" or "embedding") is stored as two boolean
    columns; *details* is free text and *metadata* a JSON-compatible dict.
    """
    row = {
        "time": datetime.now().isoformat(),
        "chat": type == "chat",
        "embedding": type == "embedding",
        "details": details,
        "metadata": metadata,
    }
    supabase.table("stats").insert(row).execute()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit-demo/utils.py DELETED
@@ -1,11 +0,0 @@
1
- import hashlib
2
-
3
def compute_sha1_from_file(file_path):
    """Return the hexadecimal SHA-1 digest of the file at *file_path*.

    Reads the whole file into memory, so it is intended for the small
    uploads this app handles.
    """
    with open(file_path, "rb") as fh:
        return hashlib.sha1(fh.read()).hexdigest()
8
-
9
def compute_sha1_from_content(content):
    """Return the hexadecimal SHA-1 digest of a bytes payload."""
    return hashlib.sha1(content).hexdigest()