RKP64 committed
Commit 0e1fbf7 · 1 parent: ec5dade

Upload 20 files

loaders/__init__.py ADDED
File without changes
loaders/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (148 Bytes).
loaders/__pycache__/audio.cpython-310.pyc ADDED
Binary file (2.39 kB).
loaders/__pycache__/common.cpython-310.pyc ADDED
Binary file (1.7 kB).
loaders/__pycache__/csv.cpython-310.pyc ADDED
Binary file (429 Bytes).
loaders/__pycache__/docx.cpython-310.pyc ADDED
Binary file (426 Bytes).
loaders/__pycache__/html.cpython-310.pyc ADDED
Binary file (1.97 kB).
loaders/__pycache__/markdown.cpython-310.pyc ADDED
Binary file (444 Bytes).
loaders/__pycache__/pdf.cpython-310.pyc ADDED
Binary file (420 Bytes).
loaders/__pycache__/powerpoint.cpython-310.pyc ADDED
Binary file (452 Bytes).
loaders/__pycache__/txt.cpython-310.pyc ADDED
Binary file (419 Bytes).
loaders/audio.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ import tempfile
+ from io import BytesIO
+ import time
+ import openai
+ import streamlit as st
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from utils import compute_sha1_from_content
+ from langchain.schema import Document
+ from stats import add_usage
+
+
+ # Transcribe audio using OpenAI's Whisper API
+ def _transcribe_audio(api_key, audio_file, stats_db):
+     openai.api_key = api_key
+     transcript = ""
+
+     with BytesIO(audio_file.read()) as audio_bytes:
+         # Get the extension of the uploaded file
+         file_extension = os.path.splitext(audio_file.name)[-1]
+
+         # Create a temporary file with the uploaded audio data and the correct extension
+         with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_audio_file:
+             temp_audio_file.write(audio_bytes.read())
+             temp_audio_file.seek(0)  # Move the file pointer back to the beginning
+
+             # Record usage when not self-hosted
+             if st.secrets.self_hosted == "false":
+                 add_usage(stats_db, "embedding", "audio", metadata={"file_name": audio_file.name, "file_type": file_extension})
+
+             # Transcribe (and translate to English) the temporary audio file
+             transcript = openai.Audio.translate("whisper-1", temp_audio_file)
+
+     return transcript
+
+ def process_audio(vector_store, file_name, stats_db):
+     # file_name is the uploaded file object from Streamlit, not a path
+     if st.secrets.self_hosted == "false":
+         if file_name.size > 10000000:
+             st.error("File size is too large. Please upload a file smaller than 10MB or self host.")
+             return
+     dateshort = time.strftime("%Y%m%d-%H%M%S")
+     file_meta_name = f"audiotranscript_{dateshort}.txt"
+     openai_api_key = st.secrets["openai_api_key"]
+     transcript = _transcribe_audio(openai_api_key, file_name, stats_db)
+     # File sha1 and size are computed from the transcript text, not the audio bytes
+     file_sha = compute_sha1_from_content(transcript.text.encode("utf-8"))
+     file_size = len(transcript.text.encode("utf-8"))
+
+     # Load chunk size and overlap from the sidebar settings
+     chunk_size = st.session_state['chunk_size']
+     chunk_overlap = st.session_state['chunk_overlap']
+
+     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     texts = text_splitter.split_text(transcript.text)
+
+     docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha, "file_size": file_size, "file_name": file_meta_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for text in texts]
+
+     if st.secrets.self_hosted == "false":
+         add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name, "file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
+     vector_store.add_documents(docs_with_metadata)
+     return vector_store
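
As a usage note: a minimal sketch of how process_audio might be wired to a Streamlit upload widget. The ingest_audio helper, the uploader label, and the default chunk settings below are illustrative assumptions; vector_store and stats_db are created elsewhere in the app and are not part of this commit.

# Hypothetical caller for process_audio (not part of this commit).
# vector_store and stats_db are assumed to exist elsewhere in the app.
import streamlit as st
from loaders.audio import process_audio

def ingest_audio(vector_store, stats_db):
    audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav", "m4a"])
    if audio_file is not None:
        # process_audio reads these chunking settings from session state
        st.session_state.setdefault("chunk_size", 500)
        st.session_state.setdefault("chunk_overlap", 0)
        process_audio(vector_store, audio_file, stats_db)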
loaders/common.py ADDED
@@ -0,0 +1,43 @@
+ import tempfile
+ import time
+ import os
+ from utils import compute_sha1_from_file
+ from langchain.schema import Document
+ import streamlit as st
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from stats import add_usage
+
+ def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
+     documents = []
+     file_name = file.name
+     file_size = file.size
+     if st.secrets.self_hosted == "false":
+         if file_size > 1000000:
+             st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
+             return
+
+     dateshort = time.strftime("%Y%m%d")
+     # Write the uploaded file to a temporary file so the loader can read it from disk
+     with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
+         tmp_file.write(file.getvalue())
+         tmp_file.flush()
+
+         loader = loader_class(tmp_file.name)
+         documents = loader.load()
+         file_sha1 = compute_sha1_from_file(tmp_file.name)
+
+     os.remove(tmp_file.name)
+
+     chunk_size = st.session_state['chunk_size']
+     chunk_overlap = st.session_state['chunk_overlap']
+
+     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     documents = text_splitter.split_documents(documents)
+
+     # Add the document sha1 and file metadata to each chunk
+     docs_with_metadata = [Document(page_content=doc.page_content, metadata={"file_sha1": file_sha1, "file_size": file_size, "file_name": file_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for doc in documents]
+
+     vector_store.add_documents(docs_with_metadata)
+     if stats_db:
+         add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name, "file_type": file_suffix, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
+     return vector_store
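
process_file is the shared ingestion path: every format module below just binds a LangChain loader class and a file suffix. As a sketch of how a new format would slot in, here is a hypothetical EPUB wrapper under the same pattern (UnstructuredEPubLoader does ship with langchain, but this file is an illustration, not part of this commit):

# Hypothetical loaders/epub.py, following the wrapper pattern used below
from .common import process_file
from langchain.document_loaders import UnstructuredEPubLoader

def process_epub(vector_store, file, stats_db):
    return process_file(vector_store, file, UnstructuredEPubLoader, ".epub", stats_db=stats_db)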
loaders/csv.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders.csv_loader import CSVLoader
+
+ def process_csv(vector_store, file, stats_db):
+     return process_file(vector_store, file, CSVLoader, ".csv", stats_db=stats_db)
loaders/docx.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders import Docx2txtLoader
+
+ def process_docx(vector_store, file, stats_db):
+     return process_file(vector_store, file, Docx2txtLoader, ".docx", stats_db=stats_db)
loaders/html.py ADDED
@@ -0,0 +1,50 @@
+ from .common import process_file
+ from langchain.document_loaders import UnstructuredHTMLLoader
+ import requests
+ import re
+ import unicodedata
+ import tempfile
+ import os
+ import streamlit as st
+ from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile
+
+ def process_html(vector_store, file, stats_db):
+     return process_file(vector_store, file, UnstructuredHTMLLoader, ".html", stats_db=stats_db)
+
+
+ def get_html(url):
+     response = requests.get(url)
+     if response.status_code == 200:
+         return response.text
+     else:
+         return None
+
+ def create_html_file(url, content):
+     file_name = slugify(url) + ".html"
+     temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
+     with open(temp_file_path, 'w') as temp_file:
+         temp_file.write(content)
+
+     # Wrap the on-disk file in a Streamlit UploadedFile so it can go through process_file
+     with open(temp_file_path, 'rb') as f:
+         record = UploadedFileRec(id=None, name=file_name, type='text/html', data=f.read())
+     uploaded_file = UploadedFile(record)
+
+     return uploaded_file, temp_file_path
+
+ def delete_tempfile(temp_file_path, url, ret):
+     try:
+         os.remove(temp_file_path)
+         if ret:
+             st.write(f"✅ Content saved... {url} ")
+     except OSError as e:
+         print(f"Error while deleting the temporary file: {str(e)}")
+         if ret:
+             st.write(f"❌ Error while saving content... {url} ")
+
+ def slugify(text):
+     # Normalize unicode, drop non-word characters, and collapse whitespace/hyphens
+     text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
+     text = re.sub(r'[^\w\s-]', '', text).strip().lower()
+     text = re.sub(r'[-\s]+', '-', text)
+     return text
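
Taken together, these helpers support a fetch-then-ingest flow for URLs: download the page, wrap it as an UploadedFile, run it through process_html, then clean up the temporary file. A minimal sketch, assuming vector_store and stats_db exist elsewhere (the ingest_url name is hypothetical):

# Hypothetical URL ingestion flow built from the helpers above
from loaders.html import get_html, create_html_file, process_html, delete_tempfile

def ingest_url(url, vector_store, stats_db):
    content = get_html(url)
    if content is None:
        return None  # non-200 response; nothing to ingest
    uploaded_file, temp_file_path = create_html_file(url, content)
    ret = process_html(vector_store, uploaded_file, stats_db)
    delete_tempfile(temp_file_path, url, ret)
    return ret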
loaders/markdown.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders import UnstructuredMarkdownLoader
+
+ def process_markdown(vector_store, file, stats_db):
+     return process_file(vector_store, file, UnstructuredMarkdownLoader, ".md", stats_db=stats_db)
loaders/pdf.py ADDED
@@ -0,0 +1,6 @@
+ from .common import process_file
+ from langchain.document_loaders import PyPDFLoader
+
+
+ def process_pdf(vector_store, file, stats_db):
+     return process_file(vector_store, file, PyPDFLoader, ".pdf", stats_db=stats_db)
loaders/powerpoint.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders import UnstructuredPowerPointLoader
+
+ def process_powerpoint(vector_store, file, stats_db):
+     return process_file(vector_store, file, UnstructuredPowerPointLoader, ".pptx", stats_db=stats_db)
loaders/txt.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders import TextLoader
+
+ def process_txt(vector_store, file, stats_db):
+     return process_file(vector_store, file, TextLoader, ".txt", stats_db=stats_db)
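
Since every process_* function shares the (vector_store, file, stats_db) signature, a caller can dispatch on the uploaded file's extension. A sketch of such a dispatcher (the PROCESSORS table and process_uploaded_file are illustrative, not part of this commit):

# Hypothetical extension-based dispatcher over the loaders in this commit
import os
from loaders.audio import process_audio
from loaders.csv import process_csv
from loaders.docx import process_docx
from loaders.html import process_html
from loaders.markdown import process_markdown
from loaders.pdf import process_pdf
from loaders.powerpoint import process_powerpoint
from loaders.txt import process_txt

PROCESSORS = {
    ".csv": process_csv,
    ".docx": process_docx,
    ".html": process_html,
    ".md": process_markdown,
    ".pdf": process_pdf,
    ".pptx": process_powerpoint,
    ".txt": process_txt,
    ".mp3": process_audio,
    ".wav": process_audio,
}

def process_uploaded_file(vector_store, file, stats_db):
    suffix = os.path.splitext(file.name)[-1].lower()
    processor = PROCESSORS.get(suffix)
    if processor is None:
        raise ValueError(f"Unsupported file type: {suffix}")
    return processor(vector_store, file, stats_db)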