cloud-sean committed on
Commit
39bdba6
·
0 Parent(s):

Duplicate from cloud-sean/AOAI-Form-Recognizer

Browse files
Files changed (3) hide show
  1. README.md +13 -0
  2. app.py +93 -0
  3. requirements.txt +159 -0
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: AOAI Form Recognizer
3
+ emoji: 💻
4
+ colorFrom: purple
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.17.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: cloud-sean/AOAI-Form-Recognizer
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import openai, os
3
+ import tqdm
4
+ import time
5
+ from langchain.vectorstores import Chroma
6
+ from PyPDF2 import PdfReader
7
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
8
+ from langchain.embeddings.openai import OpenAIEmbeddings
9
+ from langchain import VectorDBQA
10
+ from langchain.llms import AzureOpenAI
11
+ from langchain.chains import RetrievalQA
12
+ from langchain.chat_models import AzureChatOpenAI
13
+
14
+ # from langchain.chat_models import AzureChatOpenAI
15
+
16
# Azure OpenAI connection settings. Each value is published twice: once in the
# process environment and once on the ``openai`` module, presumably so that
# libraries reading the environment (e.g. langchain) see the same settings as
# direct ``openai`` calls -- confirm against the library versions in use.
os.environ.update(
    {
        "OPENAI_API_TYPE": "azure",
        "OPENAI_API_VERSION": "2023-03-15-preview",
        "OPENAI_API_BASE": "https://openai-endpoint.openai.azure.com/",
    }
)
openai.api_type = os.environ["OPENAI_API_TYPE"]
openai.api_version = os.environ["OPENAI_API_VERSION"]
openai.api_base = os.environ["OPENAI_API_BASE"]

# The key is never hard-coded: it must already be present in the environment,
# otherwise this lookup raises KeyError at start-up.
openai.api_key = os.environ["OPENAI_API_KEY"]
20
+
21
+
22
def _embed_text(text, retry_delay=8):
    """Return the embedding vector for *text*, retrying once after a failure.

    The first failure is printed and followed by a pause of ``retry_delay``
    seconds before a single retry. An error raised by the retry propagates to
    the caller.
    """
    try:
        response = openai.Embedding.create(
            input=text,
            engine="text-embedding-ada-002")
    except Exception as e:
        # Broad on purpose: any failure of the first call gets exactly one
        # delayed retry, as in the original inline code.
        print(e)
        time.sleep(retry_delay)
        response = openai.Embedding.create(
            input=text,
            engine="text-embedding-ada-002")
    return response['data'][0]['embedding']


def upload_pdf(file, pdf_text, embeddings, vectorstore, azure_embeddings, qa, progress = gr.Progress(track_tqdm=True)):
    """Index an uploaded PDF and build a question-answering chain over it.

    The PDF text is extracted page by page, split into overlapping chunks,
    embedded chunk by chunk, stored in a Chroma collection, and wrapped in a
    ``RetrievalQA`` chain.

    Args:
        file: The value of the ``gr.File`` component; passed straight to
            ``PdfReader``.
        pdf_text, embeddings, vectorstore, azure_embeddings, qa: Previous
            session-state values. They are accepted to match the Gradio event
            wiring but are all rebuilt from scratch on every call.
        progress: Gradio progress tracker; ``track_tqdm=True`` lets the
            ``tqdm`` loop below drive the progress display.

    Returns:
        A 9-tuple matching the click handler's outputs: the extracted text
        (twice: once for display, once for state), the chunk embeddings, the
        vector store, the embeddings object, the QA chain, and three
        ``gr.update`` visibility changes (show, show, hide).

    Raises:
        gr.Error: If no file was supplied or the PDF yields no text chunks.
    """
    if file is None:
        raise gr.Error("Please select a PDF file before clicking Upload.")

    reader = PdfReader(file)
    # Pages are concatenated without a separator, as before; ``or ""`` guards
    # against a page that yields no text.
    pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)
    if not texts:
        raise gr.Error("No extractable text was found in this PDF.")

    # Build a fresh list instead of appending to the incoming state value.
    # Appending made a second upload accumulate stale vectors, so the number
    # of embeddings no longer matched the number of chunks added below.
    embeddings = [_embed_text(text) for text in tqdm.tqdm(texts)]

    azure_embeddings = OpenAIEmbeddings(document_model_name="text-embedding-ada-002",query_model_name="text-embedding-ada-002")
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)

    # The vectors are inserted directly so the embeddings computed above are
    # reused rather than recomputed by the store.
    # NOTE(review): the collection name and the ``doc_{i}`` ids are fixed;
    # confirm that a repeated upload cannot collide with earlier documents.
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
        metadatas=[{"source": "source"} for _ in texts])
    qa = RetrievalQA.from_chain_type(llm= AzureChatOpenAI(deployment_name="Bartos", model_name='gpt-35-turbo' ), chain_type="stuff", retriever=vectorstore.as_retriever())

    return pdf_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
63
+
64
+
65
def add_text(chatstate, query, qa):
    """Answer *query* with the QA chain and extend the chat history.

    Args:
        chatstate: List of ``(question, answer)`` pairs so far; not mutated.
        query: The user's question.
        qa: Object exposing ``run(query)`` that returns the answer.

    Returns:
        ``(history, history, qa)``: the extended history twice (the caller
        wires one copy to the chatbot and one to the state) and the unchanged
        ``qa`` object.
    """
    answer = qa.run(query)
    # Copy first so the caller's list is left untouched.
    history = list(chatstate)
    history.append((query, answer))
    return history, history, qa
70
+
71
# UI definition: an upload view that is replaced by a chat view once a PDF has
# been indexed. Nesting follows the ``with`` statements of the original.
with gr.Blocks(css="footer {visibility: hidden}", title='PDF - Q&A') as demo:
    # One independent State per value. A single chained assignment would bind
    # all five names to the same component, so every input would carry the
    # same value and the five outputs would overwrite one another.
    pdf_text = gr.State("")
    embeddings = gr.State([])
    vectorstore = gr.State(None)
    azure_embeddings = gr.State(None)
    qa = gr.State(None)

    # Chat view: created hidden; upload_pdf's return values make it visible.
    with gr.Row(visible=False) as chat_row:
        chatbot = gr.Chatbot()
    with gr.Row(visible=False) as submit_row:
        text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
        chatstate = gr.State([])
        text.submit(add_text, [chatstate, text, qa], [chatbot, chatstate, qa])

    # Upload view: visible at start; upload_pdf's last return value hides it.
    with gr.Column() as upload_column:
        file = gr.File()
        upload_btn = gr.Button("Upload")
        output_text = gr.TextArea()
        upload_btn.click(
            upload_pdf,
            inputs=[file, pdf_text, embeddings, vectorstore, azure_embeddings, qa],
            outputs=[output_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, chat_row, submit_row, upload_column],
        )

    with gr.Row():
        gr.Markdown("`now with GPT-3.5 Turbo`")


demo.launch(enable_queue=True)
requirements.txt ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.4
3
+ aiosignal==1.3.1
4
+ altair==4.2.2
5
+ anyio==3.6.2
6
+ appnope==0.1.3
7
+ argilla==1.3.0
8
+ asttokens==2.2.1
9
+ async-timeout==4.0.2
10
+ attrs==22.2.0
11
+ backcall==0.2.0
12
+ backoff==2.2.1
13
+ blinker==1.5
14
+ cachetools==5.3.0
15
+ certifi==2022.12.7
16
+ charset-normalizer==3.0.1
17
+ chromadb==0.3.21
18
+ click==8.1.3
19
+ clickhouse-connect==0.5.12
20
+ comm==0.1.2
21
+ contourpy==1.0.7
22
+ cycler==0.11.0
23
+ dataclasses-json==0.5.7
24
+ debugpy==1.6.6
25
+ decorator==5.1.1
26
+ Deprecated==1.2.13
27
+ duckdb==0.7.1
28
+ entrypoints==0.4
29
+ et-xmlfile==1.1.0
30
+ executing==1.2.0
31
+ fastapi==0.85.2
32
+ ffmpy==0.3.0
33
+ filelock==3.9.0
34
+ fonttools==4.38.0
35
+ frozenlist==1.3.3
36
+ fsspec==2023.1.0
37
+ gitdb==4.0.10
38
+ GitPython==3.1.31
39
+ gptcache==0.1.11
40
+ gradio==3.18.0
41
+ h11==0.14.0
42
+ hnswlib==0.7.0
43
+ httpcore==0.16.3
44
+ httptools==0.5.0
45
+ httpx==0.23.3
46
+ huggingface-hub==0.12.1
47
+ idna==3.4
48
+ importlib-metadata==6.0.0
49
+ importlib-resources==5.12.0
50
+ ipykernel==6.21.2
51
+ ipython==8.10.0
52
+ jedi==0.18.2
53
+ Jinja2==3.1.2
54
+ joblib==1.2.0
55
+ jsonschema==4.17.3
56
+ jupyter-client==8.0.3
57
+ jupyter-core==5.2.0
58
+ kiwisolver==1.4.4
59
+ langchain==0.0.139
60
+ linkify-it-py==1.0.3
61
+ lxml==4.9.2
62
+ lz4==4.3.2
63
+ markdown-it-py==2.1.0
64
+ MarkupSafe==2.1.2
65
+ marshmallow==3.19.0
66
+ marshmallow-enum==1.5.1
67
+ matplotlib==3.7.0
68
+ matplotlib-inline==0.1.6
69
+ mdit-py-plugins==0.3.4
70
+ mdurl==0.1.2
71
+ monotonic==1.6
72
+ multidict==6.0.4
73
+ mypy-extensions==1.0.0
74
+ nest-asyncio==1.5.6
75
+ nltk==3.8.1
76
+ numpy==1.21.6
77
+ openai==0.27.4
78
+ openapi-schema-pydantic==1.2.4
79
+ openpyxl==3.1.1
80
+ orjson==3.8.6
81
+ packaging==23.0
82
+ pandas==1.5.3
83
+ parso==0.8.3
84
+ pexpect==4.8.0
85
+ pickleshare==0.7.5
86
+ Pillow==9.4.0
87
+ platformdirs==3.0.0
88
+ posthog==2.5.0
89
+ prompt-toolkit==3.0.36
90
+ protobuf==3.20.3
91
+ psutil==5.9.4
92
+ ptyprocess==0.7.0
93
+ pure-eval==0.2.2
94
+ pyarrow==11.0.0
95
+ pycryptodome==3.17
96
+ pydantic==1.10.5
97
+ pydeck==0.8.0
98
+ pydub==0.25.1
99
+ Pygments==2.14.0
100
+ Pympler==1.0.1
101
+ pyparsing==3.0.9
102
+ pypdf2==3.0.1
103
+ pyrsistent==0.19.3
104
+ python-dateutil==2.8.2
105
+ python-docx==0.8.11
106
+ python-dotenv==0.21.1
107
+ python-magic==0.4.27
108
+ python-multipart==0.0.5
109
+ python-pptx==0.6.21
110
+ pytz==2022.7.1
111
+ pytz-deprecation-shim==0.1.0.post0
112
+ PyYAML==6.0
113
+ pyzmq==25.0.0
114
+ regex==2022.10.31
115
+ requests==2.28.2
116
+ rfc3986==1.5.0
117
+ rich==13.3.1
118
+ scikit-learn==1.2.1
119
+ scipy==1.10.0
120
+ semver==2.13.0
121
+ sentence-transformers==2.2.2
122
+ sentencepiece==0.1.97
123
+ six==1.16.0
124
+ smmap==5.0.0
125
+ sniffio==1.3.0
126
+ SQLAlchemy==1.4.46
127
+ stack-data==0.6.2
128
+ starlette==0.20.4
129
+ streamlit==1.18.1
130
+ tenacity==8.2.1
131
+ threadpoolctl==3.1.0
132
+ tiktoken==0.3.3
133
+ tokenizers==0.13.2
134
+ toml==0.10.2
135
+ toolz==0.12.0
136
+ torch==1.13.1
137
+ torchvision==0.14.1
138
+ tornado==6.2
139
+ tqdm==4.64.1
140
+ traitlets==5.9.0
141
+ transformers==4.26.1
142
+ typing-extensions==4.5.0
143
+ typing-inspect==0.8.0
144
+ tzdata==2022.7
145
+ tzlocal==4.2
146
+ uc-micro-py==1.0.1
147
+ unstructured==0.4.11
148
+ urllib3==1.26.14
149
+ uvicorn==0.18.3
150
+ uvloop==0.17.0
151
+ validators==0.20.0
152
+ watchfiles==0.18.1
153
+ wcwidth==0.2.6
154
+ websockets==10.4
155
+ wrapt==1.14.1
156
+ XlsxWriter==3.0.8
157
+ yarl==1.8.2
158
+ zipp==3.14.0
159
+ zstandard==0.19.0