tregu0458 committed on
Commit
a821a32
·
verified ·
1 Parent(s): d62d612

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -6
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  from langchain_community.document_loaders import YoutubeLoader
3
  from langchain.text_splitter import TokenTextSplitter
 
4
 
5
  max_textboxes = 5
6
 
@@ -13,20 +14,24 @@ def process_youtube_url(url, language):
13
  )
14
  docs = loader.load()
15
  text = str(docs)
16
- char_count = len(text)
17
-
18
- text_splitter = TokenTextSplitter(chunk_size=10000, chunk_overlap=0)
 
 
19
  chunks = text_splitter.split_text(text)
20
-
21
  output_textboxes = [chunk for i, chunk in enumerate(chunks)]
22
  output_textboxes += ["" for _ in range(max_textboxes - len(chunks))]
23
 
24
- return *output_textboxes,[], text, char_count
 
25
  except Exception as e:
26
  error_msg = str(e)
27
  available_languages = extract_available_languages(error_msg)
28
  recommended_language = extract_recommended_language(error_msg)
29
- return *[error_msg for _ in range(max_textboxes)],available_languages, recommended_language, 0
 
30
 
31
  def extract_available_languages(error_msg):
32
  languages = []
 
1
  import gradio as gr
2
  from langchain_community.document_loaders import YoutubeLoader
3
  from langchain.text_splitter import TokenTextSplitter
4
+ from langchain.embeddings import OpenAIEmbeddings
5
 
6
  max_textboxes = 5
7
 
 
14
  )
15
  docs = loader.load()
16
  text = str(docs)
17
+
18
+ embeddings = OpenAIEmbeddings()
19
+ token_count = len(embeddings.encode(text))
20
+
21
+ text_splitter = TokenTextSplitter(chunk_size=20_000, chunk_overlap=0)
22
  chunks = text_splitter.split_text(text)
23
+
24
  output_textboxes = [chunk for i, chunk in enumerate(chunks)]
25
  output_textboxes += ["" for _ in range(max_textboxes - len(chunks))]
26
 
27
+ return *output_textboxes, [], text, token_count
28
+
29
  except Exception as e:
30
  error_msg = str(e)
31
  available_languages = extract_available_languages(error_msg)
32
  recommended_language = extract_recommended_language(error_msg)
33
+ return *[error_msg for _ in range(max_textboxes)], available_languages, recommended_language, 0
34
+
35
 
36
  def extract_available_languages(error_msg):
37
  languages = []