Sourbh510 committed on
Commit
3bf116e
·
verified ·
1 Parent(s): ee43544

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -9
app.py CHANGED
@@ -1,6 +1,9 @@
1
  import gradio as gr
2
  import faiss
3
  import numpy as np
 
 
 
4
 
5
  from sentence_transformers import SentenceTransformer
6
  from transformers import pipeline
@@ -21,14 +24,44 @@ gen_model=AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)
21
  chunks_store=[]
22
  index=None
23
 
 
24
 
25
- def build_kb(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  global chunks_store,index
28
 
 
 
29
  chunks=[
30
- text[i:i+300]
31
- for i in range(0,len(text),300)
32
  ]
33
 
34
  chunks_store=chunks
@@ -38,12 +71,12 @@ def build_kb(text):
38
  dim=embeddings.shape[1]
39
 
40
  index=faiss.IndexFlatL2(dim)
 
41
  index.add(
42
- np.array(embeddings).astype("float32")
43
  )
44
 
45
- return "Knowledge Base Created"
46
-
47
 
48
  def ask_question(question):
49
 
@@ -107,9 +140,8 @@ Ask questions over your own documents
107
 
108
  with gr.Tab("Build Knowledge Base"):
109
 
110
- doc=gr.Textbox(
111
- lines=12,
112
- label="Paste Document Text"
113
  )
114
 
115
  status=gr.Textbox(label="Status")
 
1
  import gradio as gr
2
  import faiss
3
  import numpy as np
4
+ import fitz
5
+ import docx
6
+ from pptx import Presentation
7
 
8
  from sentence_transformers import SentenceTransformer
9
  from transformers import pipeline
 
24
  chunks_store=[]
25
  index=None
26
 
27
def extract_text(file):
    """Extract the plain text of an uploaded PDF, DOCX or PPTX document.

    Args:
        file: The uploaded document. Either an object exposing its on-disk
            path as ``.name`` or a plain path string.

    Returns:
        The extracted text as one string. PDF page texts are concatenated
        with no separator; every DOCX paragraph and every PPTX shape text
        is followed by a newline. For any other file extension the literal
        string "Unsupported file format" is returned instead of raising.
    """
    # Accept a bare path as well as an object carrying the path in `.name`.
    path = getattr(file, "name", file)
    # Extension matching is case-insensitive; the original path is still
    # used for opening the file.
    name = path.lower()

    if name.endswith(".pdf"):
        # The context manager closes the document (and its file handle)
        # even if text extraction fails part-way through.
        with fitz.open(path) as pdf:
            return "".join(page.get_text() for page in pdf)

    if name.endswith(".docx"):
        return "".join(p.text + "\n" for p in docx.Document(path).paragraphs)

    if name.endswith(".pptx"):
        prs = Presentation(path)
        # Not every shape carries text (e.g. pictures), hence the hasattr
        # guard.
        return "".join(
            shape.text + "\n"
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )

    # NOTE(review): this sentinel is returned as ordinary text, so a caller
    # that indexes the result will index the message itself — confirm the
    # caller handles unsupported uploads as intended.
    return "Unsupported file format"
55
+
56
+ def build_kb(file):
57
 
58
  global chunks_store,index
59
 
60
+ text=extract_text(file)
61
+
62
  chunks=[
63
+ text[i:i+500]
64
+ for i in range(0,len(text),500)
65
  ]
66
 
67
  chunks_store=chunks
 
71
  dim=embeddings.shape[1]
72
 
73
  index=faiss.IndexFlatL2(dim)
74
+
75
  index.add(
76
+ np.array(embeddings).astype("float32")
77
  )
78
 
79
+ return f"Knowledge Base Created with {len(chunks)} chunks"
 
80
 
81
  def ask_question(question):
82
 
 
140
 
141
  with gr.Tab("Build Knowledge Base"):
142
 
143
+ doc=gr.File(
144
+ label="Upload PDF / DOCX / PPTX"
 
145
  )
146
 
147
  status=gr.Textbox(label="Status")