Spaces:
Sleeping
Sleeping
feat: support for word docs
Browse files
- app.py +9 -0
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import os
|
| 3 |
import PyPDF2
|
|
@@ -32,6 +33,12 @@ def load_pdf_file(file):
|
|
| 32 |
return pdf_text
|
| 33 |
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
def split_text_into_chunks(text, max_chunk_length):
|
| 36 |
chunks = []
|
| 37 |
current_chunk = ""
|
|
@@ -84,6 +91,8 @@ def main():
|
|
| 84 |
_, file_ext = os.path.splitext(file_name)
|
| 85 |
if "pdf" in file_ext:
|
| 86 |
sentence = load_pdf_file(uploaded_file)
|
|
|
|
|
|
|
| 87 |
else:
|
| 88 |
sentence = load_text_file(uploaded_file)
|
| 89 |
st.write(f"{len(sentence)} characters and {len(sentence.split())} words")
|
|
|
|
| 1 |
+
import docx
|
| 2 |
import streamlit as st
|
| 3 |
import os
|
| 4 |
import PyPDF2
|
|
|
|
| 33 |
return pdf_text
|
| 34 |
|
| 35 |
|
| 36 |
+
def load_word_file(file):
    """Return the text of a Word document as a single string.

    ``file`` is handed directly to ``docx.Document``. The ``text`` of every
    entry in the resulting document's ``paragraphs`` is collected in order
    and joined with newline characters.
    """
    document = docx.Document(file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
def split_text_into_chunks(text, max_chunk_length):
|
| 43 |
chunks = []
|
| 44 |
current_chunk = ""
|
|
|
|
| 91 |
_, file_ext = os.path.splitext(file_name)
|
| 92 |
if "pdf" in file_ext:
|
| 93 |
sentence = load_pdf_file(uploaded_file)
|
| 94 |
+
elif "docx" in file_ext:
|
| 95 |
+
sentence = load_word_file(uploaded_file)
|
| 96 |
else:
|
| 97 |
sentence = load_text_file(uploaded_file)
|
| 98 |
st.write(f"{len(sentence)} characters and {len(sentence.split())} words")
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
streamlit
|
| 2 |
#tensorflow
|
| 3 |
#tf-keras
|
|
|
|
| 1 |
+
python-docx
|
| 2 |
streamlit
|
| 3 |
#tensorflow
|
| 4 |
#tf-keras
|