RKP64 committed
Commit 0e1fbf7 · 1 parent: ec5dade

Upload 20 files

loaders/__init__.py ADDED
File without changes
loaders/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (148 Bytes).
loaders/__pycache__/audio.cpython-310.pyc ADDED
Binary file (2.39 kB).
loaders/__pycache__/common.cpython-310.pyc ADDED
Binary file (1.7 kB).
loaders/__pycache__/csv.cpython-310.pyc ADDED
Binary file (429 Bytes).
loaders/__pycache__/docx.cpython-310.pyc ADDED
Binary file (426 Bytes).
loaders/__pycache__/html.cpython-310.pyc ADDED
Binary file (1.97 kB).
loaders/__pycache__/markdown.cpython-310.pyc ADDED
Binary file (444 Bytes).
loaders/__pycache__/pdf.cpython-310.pyc ADDED
Binary file (420 Bytes).
loaders/__pycache__/powerpoint.cpython-310.pyc ADDED
Binary file (452 Bytes).
loaders/__pycache__/txt.cpython-310.pyc ADDED
Binary file (419 Bytes).
loaders/audio.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ import tempfile
+ from io import BytesIO
+ import time
+ import openai
+ import streamlit as st
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from utils import compute_sha1_from_content
+ from langchain.schema import Document
+ from stats import add_usage
+
+
+ # Transcribe audio using OpenAI's Whisper API
+ def _transcribe_audio(api_key, audio_file, stats_db):
+     openai.api_key = api_key
+     transcript = ""
+
+     with BytesIO(audio_file.read()) as audio_bytes:
+         # Get the extension of the uploaded file
+         file_extension = os.path.splitext(audio_file.name)[-1]
+
+         # Create a temporary file with the uploaded audio data and the correct extension
+         with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_audio_file:
+             temp_audio_file.write(audio_bytes.read())
+             temp_audio_file.seek(0)  # Move the file pointer back to the beginning
+
+             # Record usage when not self-hosted
+             if st.secrets.self_hosted == "false":
+                 add_usage(stats_db, "embedding", "audio", metadata={"file_name": audio_file.name, "file_type": file_extension})
+
+             # Transcribe (and translate to English) the temporary audio file
+             transcript = openai.Audio.translate("whisper-1", temp_audio_file)
+
+     return transcript
+
+ def process_audio(vector_store, file_name, stats_db):
+     # file_name is the uploaded file object from Streamlit, not a path
+     if st.secrets.self_hosted == "false":
+         if file_name.size > 10000000:
+             st.error("File size is too large. Please upload a file smaller than 10MB or self host.")
+             return
+     dateshort = time.strftime("%Y%m%d-%H%M%S")
+     file_meta_name = f"audiotranscript_{dateshort}.txt"
+     openai_api_key = st.secrets["openai_api_key"]
+     transcript = _transcribe_audio(openai_api_key, file_name, stats_db)
+     # File sha1 and size are computed from the transcript text, not the audio bytes
+     file_sha = compute_sha1_from_content(transcript.text.encode("utf-8"))
+     file_size = len(transcript.text.encode("utf-8"))
+
+     # Load chunk size and overlap from the sidebar settings
+     chunk_size = st.session_state['chunk_size']
+     chunk_overlap = st.session_state['chunk_overlap']
+
+     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     texts = text_splitter.split_text(transcript.text)
+
+     docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha, "file_size": file_size, "file_name": file_meta_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for text in texts]
+
+     if st.secrets.self_hosted == "false":
+         add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name, "file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
+     vector_store.add_documents(docs_with_metadata)
+     return vector_store
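
As a usage note: a minimal sketch of how process_audio might be wired to a Streamlit upload widget. The ingest_audio helper, the uploader label, and the default chunk settings below are illustrative assumptions; vector_store and stats_db are created elsewhere in the app and are not part of this commit.

# Hypothetical caller for process_audio (not part of this commit).
# vector_store and stats_db are assumed to exist elsewhere in the app.
import streamlit as st
from loaders.audio import process_audio

def ingest_audio(vector_store, stats_db):
    audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav", "m4a"])
    if audio_file is not None:
        # process_audio reads these chunking settings from session state
        st.session_state.setdefault("chunk_size", 500)
        st.session_state.setdefault("chunk_overlap", 0)
        process_audio(vector_store, audio_file, stats_db)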
loaders/common.py ADDED
@@ -0,0 +1,43 @@
+ import tempfile
+ import time
+ import os
+ from utils import compute_sha1_from_file
+ from langchain.schema import Document
+ import streamlit as st
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from stats import add_usage
+
+ def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
+     documents = []
+     file_name = file.name
+     file_size = file.size
+     if st.secrets.self_hosted == "false":
+         if file_size > 1000000:
+             st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
+             return
+
+     dateshort = time.strftime("%Y%m%d")
+     # Write the uploaded file to a temporary file so the loader can read it from disk
+     with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
+         tmp_file.write(file.getvalue())
+         tmp_file.flush()
+
+         loader = loader_class(tmp_file.name)
+         documents = loader.load()
+         file_sha1 = compute_sha1_from_file(tmp_file.name)
+
+     os.remove(tmp_file.name)
+
+     chunk_size = st.session_state['chunk_size']
+     chunk_overlap = st.session_state['chunk_overlap']
+
+     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     documents = text_splitter.split_documents(documents)
+
+     # Add the document sha1 and file metadata to each chunk
+     docs_with_metadata = [Document(page_content=doc.page_content, metadata={"file_sha1": file_sha1, "file_size": file_size, "file_name": file_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for doc in documents]
+
+     vector_store.add_documents(docs_with_metadata)
+     if stats_db:
+         add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name, "file_type": file_suffix, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
+     return vector_store
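
process_file is the shared ingestion path: every format module below just binds a LangChain loader class and a file suffix. As a sketch of how a new format would slot in, here is a hypothetical EPUB wrapper under the same pattern (UnstructuredEPubLoader does ship with langchain, but this file is an illustration, not part of this commit):

# Hypothetical loaders/epub.py, following the wrapper pattern used below
from .common import process_file
from langchain.document_loaders import UnstructuredEPubLoader

def process_epub(vector_store, file, stats_db):
    return process_file(vector_store, file, UnstructuredEPubLoader, ".epub", stats_db=stats_db)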
loaders/csv.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders.csv_loader import CSVLoader
+
+ def process_csv(vector_store, file, stats_db):
+     return process_file(vector_store, file, CSVLoader, ".csv", stats_db=stats_db)
loaders/docx.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders import Docx2txtLoader
+
+ def process_docx(vector_store, file, stats_db):
+     return process_file(vector_store, file, Docx2txtLoader, ".docx", stats_db=stats_db)
loaders/html.py ADDED
@@ -0,0 +1,50 @@
+ from .common import process_file
+ from langchain.document_loaders import UnstructuredHTMLLoader
+ import requests
+ import re
+ import unicodedata
+ import tempfile
+ import os
+ import streamlit as st
+ from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile
+
+ def process_html(vector_store, file, stats_db):
+     return process_file(vector_store, file, UnstructuredHTMLLoader, ".html", stats_db=stats_db)
+
+
+ def get_html(url):
+     response = requests.get(url)
+     if response.status_code == 200:
+         return response.text
+     else:
+         return None
+
+ def create_html_file(url, content):
+     file_name = slugify(url) + ".html"
+     temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
+     with open(temp_file_path, 'w') as temp_file:
+         temp_file.write(content)
+
+     # Wrap the on-disk file in a Streamlit UploadedFile so it can go through process_file
+     with open(temp_file_path, 'rb') as f:
+         record = UploadedFileRec(id=None, name=file_name, type='text/html', data=f.read())
+     uploaded_file = UploadedFile(record)
+
+     return uploaded_file, temp_file_path
+
+ def delete_tempfile(temp_file_path, url, ret):
+     try:
+         os.remove(temp_file_path)
+         if ret:
+             st.write(f"✅ Content saved... {url} ")
+     except OSError as e:
+         print(f"Error while deleting the temporary file: {str(e)}")
+         if ret:
+             st.write(f"❌ Error while saving content... {url} ")
+
+ def slugify(text):
+     # Normalize unicode, drop non-word characters, and collapse whitespace/hyphens
+     text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
+     text = re.sub(r'[^\w\s-]', '', text).strip().lower()
+     text = re.sub(r'[-\s]+', '-', text)
+     return text
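
Taken together, these helpers support a fetch-then-ingest flow for URLs: download the page, wrap it as an UploadedFile, run it through process_html, then clean up the temporary file. A minimal sketch, assuming vector_store and stats_db exist elsewhere (the ingest_url name is hypothetical):

# Hypothetical URL ingestion flow built from the helpers above
from loaders.html import get_html, create_html_file, process_html, delete_tempfile

def ingest_url(url, vector_store, stats_db):
    content = get_html(url)
    if content is None:
        return None  # non-200 response; nothing to ingest
    uploaded_file, temp_file_path = create_html_file(url, content)
    ret = process_html(vector_store, uploaded_file, stats_db)
    delete_tempfile(temp_file_path, url, ret)
    return ret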
loaders/markdown.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders import UnstructuredMarkdownLoader
+
+ def process_markdown(vector_store, file, stats_db):
+     return process_file(vector_store, file, UnstructuredMarkdownLoader, ".md", stats_db=stats_db)
loaders/pdf.py ADDED
@@ -0,0 +1,6 @@
+ from .common import process_file
+ from langchain.document_loaders import PyPDFLoader
+
+
+ def process_pdf(vector_store, file, stats_db):
+     return process_file(vector_store, file, PyPDFLoader, ".pdf", stats_db=stats_db)
loaders/powerpoint.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders import UnstructuredPowerPointLoader
+
+ def process_powerpoint(vector_store, file, stats_db):
+     return process_file(vector_store, file, UnstructuredPowerPointLoader, ".pptx", stats_db=stats_db)
loaders/txt.py ADDED
@@ -0,0 +1,5 @@
+ from .common import process_file
+ from langchain.document_loaders import TextLoader
+
+ def process_txt(vector_store, file, stats_db):
+     return process_file(vector_store, file, TextLoader, ".txt", stats_db=stats_db)
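
Since every process_* function shares the (vector_store, file, stats_db) signature, a caller can dispatch on the uploaded file's extension. A sketch of such a dispatcher (the PROCESSORS table and process_uploaded_file are illustrative, not part of this commit):

# Hypothetical extension-based dispatcher over the loaders in this commit
import os
from loaders.audio import process_audio
from loaders.csv import process_csv
from loaders.docx import process_docx
from loaders.html import process_html
from loaders.markdown import process_markdown
from loaders.pdf import process_pdf
from loaders.powerpoint import process_powerpoint
from loaders.txt import process_txt

PROCESSORS = {
    ".csv": process_csv,
    ".docx": process_docx,
    ".html": process_html,
    ".md": process_markdown,
    ".pdf": process_pdf,
    ".pptx": process_powerpoint,
    ".txt": process_txt,
    ".mp3": process_audio,
    ".wav": process_audio,
}

def process_uploaded_file(vector_store, file, stats_db):
    suffix = os.path.splitext(file.name)[-1].lower()
    processor = PROCESSORS.get(suffix)
    if processor is None:
        raise ValueError(f"Unsupported file type: {suffix}")
    return processor(vector_store, file, stats_db)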