HAOUARI Noureddine
committed on
Commit
·
d0341c5
1
Parent(s):
11fb0d6
better version 02
Browse files
app.py
CHANGED
|
@@ -11,8 +11,6 @@ encoding_anthropic = client.get_tokenizer()
|
|
| 11 |
|
| 12 |
# Model choice and max tokens input
|
| 13 |
model_choice = st.sidebar.selectbox("Choose a Model", ["OpenAI", "Anthropic"])
|
| 14 |
-
max_tokens = st.sidebar.number_input(
|
| 15 |
-
"Max number of tokens per chunk", min_value=100, value=8000)
|
| 16 |
|
| 17 |
|
| 18 |
def clean_text_content(text):
|
|
@@ -61,30 +59,60 @@ def pdf_to_text(pdf_files_data, file_names):
|
|
| 61 |
return results
|
| 62 |
|
| 63 |
|
| 64 |
-
st.title("PDF
|
| 65 |
-
st.markdown(
|
| 66 |
-
"Upload PDF files and get their content in text format splitted based on the max tokens.")
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
-
uploaded_files = st.sidebar.file_uploader(
|
| 70 |
-
"Upload PDF files", type="pdf", accept_multiple_files=True)
|
| 71 |
-
|
| 72 |
-
clean_text = st.sidebar.checkbox("Clean text before encoding and splitting?")
|
| 73 |
-
|
| 74 |
-
# Check if the text is not already in session_state
|
| 75 |
if "text_content" not in st.session_state:
|
| 76 |
st.session_state.text_content = ""
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
if st.
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
if clean_text:
|
| 90 |
st.session_state.text_content = clean_text_content(
|
|
@@ -95,16 +123,8 @@ if uploaded_files:
|
|
| 95 |
chunks = [encoding_openAI.decode(chunk_tokens) if model_choice == "OpenAI" else encoding_anthropic.decode(
|
| 96 |
chunk_tokens) for chunk_tokens in chunks_generator]
|
| 97 |
|
| 98 |
-
# Display each chunk in a separate text area
|
| 99 |
for i, chunk in enumerate(chunks, 1):
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
if model_choice == "OpenAI":
|
| 105 |
-
num_tokens = len(encoding_openAI.encode(st.session_state.text_content))
|
| 106 |
-
st.write(f"Total number of tokens (OpenAI): {num_tokens}")
|
| 107 |
-
else:
|
| 108 |
-
tokens_count = len(encoding_anthropic.encode(
|
| 109 |
-
st.session_state.text_content))
|
| 110 |
-
st.write(f"Total number of tokens (Anthropic): {tokens_count}")
|
|
|
|
| 11 |
|
| 12 |
# Model choice and max tokens input
|
| 13 |
model_choice = st.sidebar.selectbox("Choose a Model", ["OpenAI", "Anthropic"])
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def clean_text_content(text):
|
|
|
|
| 59 |
return results
|
| 60 |
|
| 61 |
|
| 62 |
+
st.title("PDF Utility")
|
|
|
|
|
|
|
| 63 |
|
| 64 |
+
# Create tabs
|
| 65 |
+
step01 = "Step 01: Upload Files"
|
| 66 |
+
step02 = "Step 02: Edit Knowledge Base"
|
| 67 |
+
step03 = "Step 03: Split text"
|
| 68 |
+
tabs = [step01, step02, step03]
|
| 69 |
+
selected_tab = st.sidebar.radio("Choose a tab", tabs)
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
if "text_content" not in st.session_state:
|
| 72 |
st.session_state.text_content = ""
|
| 73 |
|
| 74 |
+
# Define content for each tab
|
| 75 |
+
if selected_tab == step02:
|
| 76 |
+
st.subheader("Knowledge Base Text Area")
|
| 77 |
+
st.session_state.text_content = st.text_area(
|
| 78 |
+
"Knowledge Text Area", st.session_state.text_content, height=400)
|
| 79 |
+
if st.button("Compute Tokens"):
|
| 80 |
+
if model_choice == "OpenAI":
|
| 81 |
+
num_tokens = len(encoding_openAI.encode(
|
| 82 |
+
st.session_state.text_content))
|
| 83 |
+
st.write(f"Total number of tokens (OpenAI): {num_tokens}")
|
| 84 |
+
else:
|
| 85 |
+
tokens_count = len(encoding_anthropic.encode(
|
| 86 |
+
st.session_state.text_content))
|
| 87 |
+
st.write(f"Total number of tokens (Anthropic): {tokens_count}")
|
| 88 |
+
elif selected_tab == step01:
|
| 89 |
+
st.subheader("Upload PDFs to Append to Knowledge Base")
|
| 90 |
+
|
| 91 |
+
uploaded_files = st.file_uploader(
|
| 92 |
+
"Upload PDF files", type="pdf", accept_multiple_files=True)
|
| 93 |
+
if uploaded_files:
|
| 94 |
+
pdf_files_data = [io.BytesIO(uploaded_file.read())
|
| 95 |
+
for uploaded_file in uploaded_files]
|
| 96 |
+
file_names = [uploaded_file.name for uploaded_file in uploaded_files]
|
| 97 |
+
|
| 98 |
+
if st.button('Convert and add to knowledge database'):
|
| 99 |
+
converting_message = st.text("Converting PDFs...")
|
| 100 |
+
converted_text = "\n".join(pdf_to_text(pdf_files_data, file_names))
|
| 101 |
+
st.session_state.text_content += converted_text
|
| 102 |
+
converting_message.empty()
|
| 103 |
+
|
| 104 |
+
elif selected_tab == step03:
|
| 105 |
+
st.subheader("Splitting Options")
|
| 106 |
+
|
| 107 |
+
model_choice = st.selectbox(
|
| 108 |
+
"Choose a Model", ["OpenAI", "Anthropic"], key="model_choice_selectbox")
|
| 109 |
+
max_tokens = st.number_input(
|
| 110 |
+
"Max number of tokens per chunk", min_value=100, value=8000, key="max_tokens_input")
|
| 111 |
+
clean_text = st.checkbox("Clean text before encoding and splitting?")
|
| 112 |
+
|
| 113 |
+
# Add prefix and postfix input options
|
| 114 |
+
prefix = st.text_area("Prefix for each chunk:", "")
|
| 115 |
+
postfix = st.text_area("Postfix for each chunk:", "")
|
| 116 |
|
| 117 |
if clean_text:
|
| 118 |
st.session_state.text_content = clean_text_content(
|
|
|
|
| 123 |
chunks = [encoding_openAI.decode(chunk_tokens) if model_choice == "OpenAI" else encoding_anthropic.decode(
|
| 124 |
chunk_tokens) for chunk_tokens in chunks_generator]
|
| 125 |
|
|
|
|
| 126 |
for i, chunk in enumerate(chunks, 1):
|
| 127 |
+
# Add prefix and postfix to each chunk
|
| 128 |
+
chunk_with_affixes = f"{prefix}{chunk}{postfix}"
|
| 129 |
+
chunk_content = st.text_area(
|
| 130 |
+
f"Chunk {i} content:", chunk_with_affixes, height=200)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|