Rohan12345 committed on
Commit
a853a16
·
verified ·
1 Parent(s): c4af46b

Update app.py

Browse files

hello commit 1

Files changed (1) hide show
  1. app.py +45 -30
app.py CHANGED
@@ -1,46 +1,61 @@
1
  import gradio as gr
2
- import os
3
-
4
  from langchain_community.document_loaders import PyPDFLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_community.vectorstores import Chroma
7
- from langchain.chains import ConversationalRetrievalChain
8
- from langchain_community.embeddings import HuggingFaceEmbeddings
9
- from langchain_community.llms import HuggingFacePipeline
10
- from langchain.chains import ConversationChain
11
- from langchain.memory import ConversationBufferMemory
12
- from langchain_community.llms import HuggingFaceEndpoint
13
-
14
  from pathlib import Path
15
- import chromadb
16
  from unidecode import unidecode
17
 
18
- from transformers import AutoTokenizer
19
- import transformers
20
- import torch
21
- import tqdm
22
- import accelerate
23
- import re
24
-
25
- list_llm = ["HuggingFaceH4/zephyr-7b-beta", "mistralai/Mistral-7B-Instruct-v0.2"]
26
- list_llm_simple = [os.path.basename(llm) for llm in list_llm]
27
-
28
  def summarize_document(document_text):
29
  # Your summarization code here
30
  summary = "The document covers various topics such as X, Y, and Z, providing detailed insights into each aspect."
31
  return summary
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def demo():
34
- with gr.Blocks(theme="base") as demo:
35
- gr.Markdown("<center><h2>PDF Summarizer</center></h2>")
36
-
37
- text_input = gr.Textbox(placeholder="Paste your document text here", label="Document Text")
38
- summarize_btn = gr.Button("Summarize")
39
- summary_output = gr.Textbox(readonly=True, label="Summary")
40
-
41
- summarize_btn.click(summarize_document, inputs=[text_input], outputs=[summary_output])
42
-
43
- demo.launch()
44
 
45
  if __name__ == "__main__":
46
  demo()
 
1
  import gradio as gr
 
 
2
  from langchain_community.document_loaders import PyPDFLoader
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.vectorstores import Chroma
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
 
 
 
 
 
 
6
  from pathlib import Path
 
7
  from unidecode import unidecode
8
 
 
 
 
 
 
 
 
 
 
 
9
  def summarize_document(document_text):
10
  # Your summarization code here
11
  summary = "The document covers various topics such as X, Y, and Z, providing detailed insights into each aspect."
12
  return summary
13
 
14
def initialize_database(list_file_obj, chunk_size, chunk_overlap, progress=gr.Progress()):
    """Build a vector database from the uploaded PDF files.

    Args:
        list_file_obj: Uploaded file objects, each exposing its path as
            ``.name``. ``None`` entries are skipped; ``None`` for the whole
            argument is treated as "no files".
        chunk_size: Forwarded to ``load_doc``.
        chunk_overlap: Forwarded to ``load_doc``.
        progress: Kept for interface compatibility; not used in the body.

    Returns:
        A tuple ``(vector_db, collection_name, "Complete!")``.

    Raises:
        gr.Error: If no usable file was supplied.
    """
    list_file_path = [x.name for x in (list_file_obj or []) if x is not None]
    if not list_file_path:
        # Without this guard the code below fails with an opaque IndexError
        # (or TypeError for None) when nothing has been uploaded.
        raise gr.Error("Please upload at least one PDF file before initializing the database.")

    # The collection is named after the first uploaded file.
    collection_name = create_collection_name(list_file_path[0])
    doc_splits = load_doc(list_file_path, chunk_size, chunk_overlap)
    vector_db = create_db(doc_splits, collection_name)
    return vector_db, collection_name, "Complete!"
20
+
21
def load_doc(list_file_path, chunk_size, chunk_overlap):
    """Load every PDF in ``list_file_path`` and split the pages into chunks.

    Args:
        list_file_path: Paths of the PDF files to load.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to the same splitter.

    Returns:
        The result of ``split_documents`` applied to all loaded pages.
    """
    # Build all loaders up front, then gather their pages in file order.
    pdf_loaders = [PyPDFLoader(path) for path in list_file_path]
    all_pages = [page for pdf_loader in pdf_loaders for page in pdf_loader.load()]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(all_pages)
31
+
32
def create_db(splits, collection_name):
    """Create a Chroma vector store from document chunks.

    Args:
        splits: Document chunks to embed and index.
        collection_name: Name of the Chroma collection to create.

    Returns:
        The ``Chroma`` vector store returned by ``Chroma.from_documents``.
    """
    # Imported locally: the module's import block does not bring ``chromadb``
    # into scope, so the bare reference below would raise NameError.
    import chromadb

    embedding = HuggingFaceEmbeddings()
    new_client = chromadb.EphemeralClient()
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        client=new_client,
        collection_name=collection_name,
    )
    return vectordb
42
+
43
def create_collection_name(filepath):
    """Derive a collection name from a file path.

    The file stem is transliterated with ``unidecode``, every run of
    characters outside ``A-Za-z0-9`` is collapsed to a single ``-``, and the
    result is cut to 50 characters. Names shorter than 3 characters get
    ``'xyz'`` appended, and a non-alphanumeric first or last character is
    replaced by ``'A'`` or ``'Z'`` respectively.
    NOTE(review): presumably these rules mirror Chroma's collection-name
    constraints - confirm against the Chroma version in use.

    Args:
        filepath: Path of the file the collection is named after.

    Returns:
        The sanitized collection name.
    """
    # Imported locally: the module's import block does not bring ``re`` into
    # scope, so the ``re.sub`` call below would raise NameError.
    import re

    collection_name = Path(filepath).stem
    collection_name = unidecode(collection_name)
    collection_name = re.sub(r'[^A-Za-z0-9]+', '-', collection_name)
    collection_name = collection_name[:50]
    if len(collection_name) < 3:
        # Padding also guarantees the name is non-empty before indexing below.
        collection_name = collection_name + 'xyz'
    if not collection_name[0].isalnum():
        collection_name = 'A' + collection_name[1:]
    if not collection_name[-1].isalnum():
        collection_name = collection_name[:-1] + 'Z'
    return collection_name
55
+
56
  def demo():
57
+ with gr.Interface(summarize_document, inputs="text", outputs="text", title="PDF Summarizer") as iface:
58
+ iface.launch()
 
 
 
 
 
 
 
 
59
 
60
  if __name__ == "__main__":
61
  demo()