File size: 2,961 Bytes
ff1b4c6
 
 
 
 
 
 
 
 
 
 
 
 
284588a
ff1b4c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e977829
 
 
 
 
 
 
 
ff1b4c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b36406
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone, Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
#from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

from langchain.document_loaders import DirectoryLoader
#from langchain.document_loaders import TextLoader




# Debug helpers (commented out): show the current working directory contents.
#print(os.getcwd())
#print(os.listdir())


# Load every .txt file (recursively) from the scraped-website directory.
txt_loader = DirectoryLoader('www.mlconsultants.net/', glob = "**/*.txt")
documents = txt_loader.load()

# Preprocessing: normalize whitespace in raw page text before chunking.
def remove_newlines(serie):
    """Collapse newlines (real and escaped) and runs of spaces into single spaces.

    serie: the raw page_content string of a loaded document.
    Returns the cleaned string (leading/trailing single spaces are kept).
    """
    serie = serie.replace('\n', ' ')
    serie = serie.replace('\\n', ' ')
    # Bug fix: a fixed number of '  ' -> ' ' passes leaves residual double
    # spaces for longer runs; loop until no double space remains.
    while '  ' in serie:
        serie = serie.replace('  ', ' ')
    return serie

# Apply the whitespace preprocessing to the text of every loaded document.
for doc in documents:
    doc.page_content = remove_newlines(doc.page_content)


# Debug helpers (commented out): inspect individual loaded documents.
#documents[1]
#print(len(documents))
#documents[6]

# Split the cleaned documents into overlapping chunks for embedding.
text_splitter = CharacterTextSplitter(chunk_size =1000, chunk_overlap = 20, separator=" ")
documents = text_splitter.split_documents(documents)
print(len(documents))
documents[0]  # NOTE(review): bare expression has no effect in a script — likely a notebook leftover


embeddings = OpenAIEmbeddings()


# Embed all chunks and store them in a Chroma vector store.
# NOTE(review): embedding calls the OpenAI API and is billed per token — confirm cost expectations.
vectorstore = Chroma.from_documents(documents, embeddings)


# Retrieve the k most similar chunks per query and wire them into a conversational QA chain.
# NOTE(review): earlier comment said "top6" but k is 5 — confirm the intended value.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs = {"k":5})
qa = ConversationalRetrievalChain.from_llm(OpenAI(model_name = "text-davinci-003"), retriever)
chat_history = []



# Gradio UI: a simple chat interface on top of the retrieval QA chain above.
import gradio as gr
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # M&L Projekt: Human Search - Wissensdatenbanken in natürlicher Sprache anfragen und erhalten!
    Ich bin eine generative KI und mir steht das Wissen der M&L Webseite bis Mai 2023 zur Verfügung. 
    Frage mich etwas! Z.b. Was ist die Targetmatrix? Wen oder was sponsort die M&L? Wann wurde die M&L gegründet? Uvm.
    Bitte beachte: Ich bin ein erster Protoyp und mein Können wird noch ausgebaut!
    """
    )
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def respond(user_message, chat_history):
        """Answer one user message via the QA chain and extend the chat history.

        user_message: text entered in the textbox.
        chat_history: history pairs supplied by the gr.Chatbot component
            (this parameter shadows the module-level chat_history list).
        Returns ("", updated_history) so the textbox clears and the chat updates.
        """
        print(user_message)
        #print(chat_history)
        # The history may arrive as lists of [user, bot]; convert to tuples
        # because the chain expects (user, bot) tuples.
        if chat_history:
          chat_history = [tuple(sublist) for sublist in chat_history]
          print(chat_history)


        # Get response from QA chain
        response = qa({"question": user_message, "chat_history": chat_history})
        # Append user message and response to chat history
        chat_history.append((user_message, response["answer"]))
        print(chat_history)
        return "", chat_history

    # Submitting the textbox feeds (msg, chatbot) into respond and writes back (msg, chatbot).
    msg.submit(respond, [msg, chatbot], [msg, chatbot], queue=False)
    # The Clear button empties the chatbot display.
    clear.click(lambda: None, None, chatbot, queue=False)


demo.launch(debug=True)