viboognesh committed on
Commit
84015e9
·
verified ·
1 Parent(s): 5fa1281

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +27 -18
main.py CHANGED
@@ -1,13 +1,12 @@
1
  from fastapi import FastAPI, File, UploadFile, Depends
2
  from fastapi.middleware.cors import CORSMiddleware
 
3
  from typing import List, Dict, Any
4
  from io import BytesIO, StringIO
5
  from docx import Document
6
  from langchain.docstore.document import Document as langchain_Document
7
  from PyPDF2 import PdfReader
8
-
9
  import csv
10
- from dotenv import load_dotenv
11
 
12
  from langchain.prompts import ChatPromptTemplate, PromptTemplate
13
  from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
@@ -17,31 +16,34 @@ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
17
  from langchain_community.vectorstores import Chroma
18
  from langchain.chains import ConversationalRetrievalChain
19
 
 
 
20
  load_dotenv()
21
 
 
22
  class Document_Processor:
23
- def __init__(self , file_details: List[Dict[Any, str]]):
24
  self.file_details = file_details
25
 
26
  def get_docs(self) -> List[langchain_Document]:
27
  docs = []
28
  for file_detail in self.file_details:
29
  if file_detail["name"].endswith(".txt"):
30
- docs.extend(self.get_txt_docs(file_detail))
31
 
32
  elif file_detail["name"].endswith(".csv"):
33
- docs.extend(self.get_csv_docs(file_detail))
34
 
35
  elif file_detail["name"].endswith(".docx"):
36
- docs.extend(self.get_docx_docs(file_detail))
37
 
38
  elif file_detail["name"].endswith(".pdf"):
39
- docs.extend(self.get_pdf_docs(file_detail))
40
 
41
  return docs
42
 
43
  @staticmethod
44
- def get_txt_docs(self, file_detail: Dict[str, Any]) -> List[langchain_Document]:
45
  text = file_detail["content"].decode("utf-8")
46
  source = file_detail["name"]
47
  text_splitter = RecursiveCharacterTextSplitter(
@@ -53,7 +55,7 @@ class Document_Processor:
53
  return text_docs
54
 
55
  @staticmethod
56
- def get_csv_docs(self, file_detail: Dict[str, Any]) -> List[langchain_Document]:
57
  csv_data = file_detail["content"]
58
  source = file_detail["name"]
59
  csv_string = csv_data.decode("utf-8")
@@ -73,7 +75,7 @@ class Document_Processor:
73
  return csv_docs
74
 
75
  @staticmethod
76
- def get_pdf_docs(self, file_detail: Dict[str, Any]) -> List[langchain_Document]:
77
  pdf_content = BytesIO(file_detail["content"])
78
  source = file_detail["name"]
79
 
@@ -82,27 +84,32 @@ class Document_Processor:
82
  for page in reader.pages:
83
  pdf_text += page.extract_text() + "\n"
84
 
85
- pdf_docs = RecursiveCharacterTextSplitter.create_documents(
 
 
 
86
  texts=[pdf_text], metadatas=[{"source": source}]
87
  )
88
  return pdf_docs
89
 
90
  @staticmethod
91
- def get_docx_docs(self, file_detail: Dict[str, Any]) -> List[langchain_Document]:
92
  docx_content = BytesIO(file_detail["content"])
93
  source = file_detail["name"]
94
 
95
  document = Document(docx_content)
96
  docx_text = " ".join([paragraph.text for paragraph in document.paragraphs])
97
 
98
- docx_docs = RecursiveCharacterTextSplitter.create_documents(
 
 
 
99
  [docx_text], metadatas=[{"source": source}]
100
  )
101
  return docx_docs
102
 
103
 
104
  class Conversational_Chain:
105
-
106
  def __init__(self, file_details: List[Dict[Any, str]]):
107
  self.llm_model = ChatOpenAI()
108
  self.embeddings = OpenAIEmbeddings()
@@ -132,7 +139,7 @@ class Conversational_Chain:
132
  return conversation_chain
133
 
134
  @staticmethod
135
- def get_document_prompt(self) -> PromptTemplate:
136
  document_template = """Document Content:{page_content}
137
  Document Path: {source}"""
138
  return PromptTemplate(
@@ -141,7 +148,7 @@ class Conversational_Chain:
141
  )
142
 
143
  @staticmethod
144
- def get_question_generator_prompt(self) -> PromptTemplate:
145
  question_generator_template = """Combine the chat history and follow up question into
146
  a standalone question.\n Chat History: {chat_history}\n
147
  Follow up question: {question}
@@ -149,7 +156,7 @@ class Conversational_Chain:
149
  return PromptTemplate.from_template(question_generator_template)
150
 
151
  @staticmethod
152
- def get_final_prompt(self) -> ChatPromptTemplate:
153
  final_prompt_template = """Answer question based on the context and chat_history.
154
  If you cannot find answers, ask more related questions from the user.
155
  Use only the basename of the file path as name of the documents.
@@ -201,7 +208,9 @@ async def upload_files(files: List[UploadFile] = File(...)):
201
  details = {"content": content, "name": name}
202
  file_details.append(details)
203
 
204
- app.state.conversational_chain = Conversational_Chain(file_details).create_conversational_chain()
 
 
205
  print("conversational_chain_manager created")
206
  return {"message": "ConversationalRetrievalChain is created. Please ask questions."}
207
 
 
1
  from fastapi import FastAPI, File, UploadFile, Depends
2
  from fastapi.middleware.cors import CORSMiddleware
3
+
4
  from typing import List, Dict, Any
5
  from io import BytesIO, StringIO
6
  from docx import Document
7
  from langchain.docstore.document import Document as langchain_Document
8
  from PyPDF2 import PdfReader
 
9
  import csv
 
10
 
11
  from langchain.prompts import ChatPromptTemplate, PromptTemplate
12
  from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
 
16
  from langchain_community.vectorstores import Chroma
17
  from langchain.chains import ConversationalRetrievalChain
18
 
19
+ from dotenv import load_dotenv
20
+
21
  load_dotenv()
22
 
23
+
24
  class Document_Processor:
25
+ def __init__(self, file_details: List[Dict[Any, str]]):
26
  self.file_details = file_details
27
 
28
  def get_docs(self) -> List[langchain_Document]:
29
  docs = []
30
  for file_detail in self.file_details:
31
  if file_detail["name"].endswith(".txt"):
32
+ docs.extend(self.get_txt_docs(file_detail=file_detail))
33
 
34
  elif file_detail["name"].endswith(".csv"):
35
+ docs.extend(self.get_csv_docs(file_detail=file_detail))
36
 
37
  elif file_detail["name"].endswith(".docx"):
38
+ docs.extend(self.get_docx_docs(file_detail=file_detail))
39
 
40
  elif file_detail["name"].endswith(".pdf"):
41
+ docs.extend(self.get_pdf_docs(file_detail=file_detail))
42
 
43
  return docs
44
 
45
  @staticmethod
46
+ def get_txt_docs(file_detail: Dict[str, Any]) -> List[langchain_Document]:
47
  text = file_detail["content"].decode("utf-8")
48
  source = file_detail["name"]
49
  text_splitter = RecursiveCharacterTextSplitter(
 
55
  return text_docs
56
 
57
  @staticmethod
58
+ def get_csv_docs(file_detail: Dict[str, Any]) -> List[langchain_Document]:
59
  csv_data = file_detail["content"]
60
  source = file_detail["name"]
61
  csv_string = csv_data.decode("utf-8")
 
75
  return csv_docs
76
 
77
  @staticmethod
78
+ def get_pdf_docs(file_detail: Dict[str, Any]) -> List[langchain_Document]:
79
  pdf_content = BytesIO(file_detail["content"])
80
  source = file_detail["name"]
81
 
 
84
  for page in reader.pages:
85
  pdf_text += page.extract_text() + "\n"
86
 
87
+ text_splitter = RecursiveCharacterTextSplitter(
88
+ chunk_size=1000, chunk_overlap=100
89
+ )
90
+ pdf_docs = text_splitter.create_documents(
91
  texts=[pdf_text], metadatas=[{"source": source}]
92
  )
93
  return pdf_docs
94
 
95
  @staticmethod
96
def get_docx_docs(file_detail: Dict[str, Any]) -> List["langchain_Document"]:
    """Split a .docx upload into LangChain documents.

    Args:
        file_detail: dict with "name" (source file name, str) and
            "content" (raw .docx bytes).

    Returns:
        Chunked langchain_Document objects, each carrying the source
        file name in its metadata.
    """
    docx_content = BytesIO(file_detail["content"])
    source = file_detail["name"]

    # NOTE(review): paragraphs are joined with a single space, so
    # paragraph boundaries are not preserved in the extracted text.
    document = Document(docx_content)
    docx_text = " ".join(paragraph.text for paragraph in document.paragraphs)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100
    )
    # Pass texts= by keyword for consistency with get_pdf_docs.
    docx_docs = text_splitter.create_documents(
        texts=[docx_text], metadatas=[{"source": source}]
    )
    return docx_docs
110
 
111
 
112
  class Conversational_Chain:
 
113
  def __init__(self, file_details: List[Dict[Any, str]]):
114
  self.llm_model = ChatOpenAI()
115
  self.embeddings = OpenAIEmbeddings()
 
139
  return conversation_chain
140
 
141
  @staticmethod
142
+ def get_document_prompt() -> PromptTemplate:
143
  document_template = """Document Content:{page_content}
144
  Document Path: {source}"""
145
  return PromptTemplate(
 
148
  )
149
 
150
  @staticmethod
151
+ def get_question_generator_prompt() -> PromptTemplate:
152
  question_generator_template = """Combine the chat history and follow up question into
153
  a standalone question.\n Chat History: {chat_history}\n
154
  Follow up question: {question}
 
156
  return PromptTemplate.from_template(question_generator_template)
157
 
158
  @staticmethod
159
+ def get_final_prompt() -> ChatPromptTemplate:
160
  final_prompt_template = """Answer question based on the context and chat_history.
161
  If you cannot find answers, ask more related questions from the user.
162
  Use only the basename of the file path as name of the documents.
 
208
  details = {"content": content, "name": name}
209
  file_details.append(details)
210
 
211
+ app.state.conversational_chain = Conversational_Chain(
212
+ file_details
213
+ ).create_conversational_chain()
214
  print("conversational_chain_manager created")
215
  return {"message": "ConversationalRetrievalChain is created. Please ask questions."}
216