ferguch9 committed on
Commit
19977fc
·
1 Parent(s): 20c1d8c

feat: support for word docs

Browse files
Files changed (2) hide show
  1. app.py +9 -0
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import streamlit as st
2
  import os
3
  import PyPDF2
@@ -32,6 +33,12 @@ def load_pdf_file(file):
32
  return pdf_text
33
 
34
 
 
 
 
 
 
 
35
  def split_text_into_chunks(text, max_chunk_length):
36
  chunks = []
37
  current_chunk = ""
@@ -84,6 +91,8 @@ def main():
84
  _, file_ext = os.path.splitext(file_name)
85
  if "pdf" in file_ext:
86
  sentence = load_pdf_file(uploaded_file)
 
 
87
  else:
88
  sentence = load_text_file(uploaded_file)
89
  st.write(f"{len(sentence)} characters and {len(sentence.split())} words")
 
1
+ import docx
2
  import streamlit as st
3
  import os
4
  import PyPDF2
 
33
  return pdf_text
34
 
35
 
36
+ def load_word_file(file):
37
+ doc = docx.Document(file)
38
+ paragraphs = [p.text for p in doc.paragraphs]
39
+ return "\n".join(paragraphs)
40
+
41
+
42
  def split_text_into_chunks(text, max_chunk_length):
43
  chunks = []
44
  current_chunk = ""
 
91
  _, file_ext = os.path.splitext(file_name)
92
  if "pdf" in file_ext:
93
  sentence = load_pdf_file(uploaded_file)
94
+ elif "docx" in file_ext:
95
+ sentence = load_word_file(uploaded_file)
96
  else:
97
  sentence = load_text_file(uploaded_file)
98
  st.write(f"{len(sentence)} characters and {len(sentence.split())} words")
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  streamlit
2
  #tensorflow
3
  #tf-keras
 
1
+ python-docx
2
  streamlit
3
  #tensorflow
4
  #tf-keras