Spaces:

KrishP-12
/

docacpc

Sleeping

App Files Files Community

KrishP-12 committed on Nov 15, 2024

Commit

2de3e63

verified ·

1 Parent(s): aafc661

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -7

app.py CHANGED Viewed

@@ -12,17 +12,19 @@ from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
 from langchain_groq import ChatGroq
 class ChatbotModel:
     def __init__(self):
         os.environ["GROQ_API_KEY"] = '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'
         self.embeddings = HuggingFaceEmbeddings(
             model_name="sentence-transformers/all-MiniLM-L6-v2",
             model_kwargs={'device': 'cpu'},
             encode_kwargs={'normalize_embeddings': True}
         )
         self.llm = ChatGroq(
             model='llama3-70b-8192',
             temperature=0.5,
@@ -31,14 +33,70 @@ class ChatbotModel:
             max_retries=2,
         )
         self.memory = ConversationBufferMemory(memory_key="history", input_key="question")
-        self.template = """You are an intelligent assistant... (Rest of your prompt as is)"""
         self.QA_CHAIN_PROMPT = PromptTemplate(
             input_variables=["history", "context", "question"],
             template=self.template
         )
         self.db1 = None
         self.qa_chain = None
@@ -51,13 +109,16 @@ class ChatbotModel:
         return "\n".join([pytesseract.image_to_string(img, lang=language) for img in images])
     def process_file(self, uploaded_file):
         _, file_extension = os.path.splitext(uploaded_file.name)
         file_extension = file_extension.lower()
         with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
             temp_file.write(uploaded_file.read())
             temp_path = temp_file.name
         if file_extension == '.pdf':
             raw_text = self.ocr_pdf(temp_path, language='guj+eng')
         elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
@@ -65,9 +126,11 @@ class ChatbotModel:
         else:
             return "Unsupported file format."
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
         text_chunks = text_splitter.split_text(raw_text)
         self.db1 = FAISS.from_documents(text_chunks, self.embeddings)
         self.qa_chain = RetrievalQA.from_chain_type(
             self.llm,
@@ -80,27 +143,26 @@ class ChatbotModel:
                 "memory": self.memory
             }
         )
         return "File processed successfully!"
     def get_response(self, user_input):
         if not self.qa_chain:
             return "Please upload and process a file before asking questions."
         response = self.qa_chain({"query": user_input})
         return response["result"]
 chatbot = ChatbotModel()
 def upload_and_process(file):
     """Gradio upload callback: hand the uploaded file to the shared chatbot.

     Returns whatever ``chatbot.process_file`` returns, unchanged.
     """
     status = chatbot.process_file(file)
     return status
 def ask_question(question):
     """Gradio question callback: forward the question to the shared chatbot.

     Returns whatever ``chatbot.get_response`` returns, unchanged.
     """
     reply = chatbot.get_response(question)
     return reply
 interface = gr.Blocks()
 with interface:
@@ -115,7 +177,9 @@ with interface:
         ask_btn = gr.Button("Submit")
     answer = gr.Textbox(label="Answer")
     upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
     ask_btn.click(ask_question, inputs=question_box, outputs=answer)
 interface.launch()

 from langchain.chains import RetrievalQA
 from langchain_groq import ChatGroq
 class ChatbotModel:
     def __init__(self):
+        # Initialize the environment variable for the GROQ API Key
         os.environ["GROQ_API_KEY"] = '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'
+        # Initialize embeddings
         self.embeddings = HuggingFaceEmbeddings(
             model_name="sentence-transformers/all-MiniLM-L6-v2",
             model_kwargs={'device': 'cpu'},
             encode_kwargs={'normalize_embeddings': True}
         )
+        # Initialize the chat model
         self.llm = ChatGroq(
             model='llama3-70b-8192',
             temperature=0.5,
             max_retries=2,
         )
+        # Initialize memory for conversation
         self.memory = ConversationBufferMemory(memory_key="history", input_key="question")
+        # Create the QA chain prompt template
+        self.template = """You are an intelligent educational assistant specialized in handling queries about documents in both English and Gujarati languages. You have been provided with OCR-processed text from {document_type} that contains important educational information.
+        Core Responsibilities:
+        1. Language Processing:
+           - Identify the language of the user's query (English or Gujarati)
+           - Respond in the same language as the query
+           - If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
+           - For technical terms, provide both English and Gujarati versions when relevant
+        2. Document Understanding:
+           - Analyze the OCR-processed text from the uploaded {document_type}
+           - Account for potential OCR errors or misinterpretations
+           - Focus on extracting accurate information despite possible OCR imperfections
+        3. Response Guidelines:
+           - Provide direct, clear answers based solely on the document content
+           - If information is unclear due to OCR quality, mention this limitation
+           - For numerical data (dates, percentages, marks), double-check accuracy before responding
+           - If information is not found in the document, clearly state: "This information is not present in the uploaded document"
+        4. Educational Context:
+           - Maintain focus on educational queries related to the document content
+           - For admission-related queries, emphasize important deadlines and requirements
+           - For scholarship information, highlight eligibility criteria and application processes
+           - For course-related queries, provide detailed, accurate information from the document
+        5. Response Format:
+           - Structure responses clearly with relevant subpoints when necessary
+           - For complex information, break down the answer into digestible parts
+           - Include relevant reference points from the document when applicable
+           - Format numerical data and dates clearly
+        6. Quality Control:
+           - Verify that responses align with the document content
+           - Don't make assumptions beyond the provided information
+           - If multiple interpretations are possible due to OCR quality, mention all possibilities
+           - Maintain consistency in terminology throughout the conversation
+        Important Rules:
+        - Never make up information not present in the document
+        - Don't combine information from previous conversations or external knowledge
+        - Always indicate if certain parts of the document are unclear due to OCR quality
+        - Maintain professional tone while being accessible to students and parents
+        - If the query is out of scope of the uploaded document, politely redirect to relevant official sources
+        Context from uploaded document:
+        {context}
+        Chat History:
+        {history}
+        Current Question: {question}
+        Assistant: Let me provide a clear and accurate response based on the uploaded document content...
+        """
         self.QA_CHAIN_PROMPT = PromptTemplate(
             input_variables=["history", "context", "question"],
             template=self.template
         )
         self.db1 = None
         self.qa_chain = None
         return "\n".join([pytesseract.image_to_string(img, lang=language) for img in images])
     def process_file(self, uploaded_file):
+        """Process an uploaded file and initialize the QA chain."""
         _, file_extension = os.path.splitext(uploaded_file.name)
         file_extension = file_extension.lower()
+        # Temporarily save the file for processing
         with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
             temp_file.write(uploaded_file.read())
             temp_path = temp_file.name
+        # OCR processing based on file type
         if file_extension == '.pdf':
             raw_text = self.ocr_pdf(temp_path, language='guj+eng')
         elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
         else:
             return "Unsupported file format."
+        # Split text into chunks
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
         text_chunks = text_splitter.split_text(raw_text)
+        # Create vector store and initialize QA chain
         self.db1 = FAISS.from_documents(text_chunks, self.embeddings)
         self.qa_chain = RetrievalQA.from_chain_type(
             self.llm,
                 "memory": self.memory
             }
         )
         return "File processed successfully!"
     def get_response(self, user_input):
+        """Generate a response to the user's question.
+
+        Returns a fixed message asking the user to upload and process a file
+        when ``self.qa_chain`` is unset; otherwise calls the chain with
+        ``{"query": user_input}`` and returns the ``"result"`` entry of its
+        output.
+        """
         if not self.qa_chain:
             return "Please upload and process a file before asking questions."
         response = self.qa_chain({"query": user_input})
         return response["result"]
+# Initialize the chatbot
 chatbot = ChatbotModel()
+# Define Gradio interface functions
 def upload_and_process(file):
     """Gradio upload callback: hand the uploaded file to the shared chatbot.

     Returns whatever ``chatbot.process_file`` returns, unchanged.
     """
     status = chatbot.process_file(file)
     return status
 def ask_question(question):
     """Gradio question callback: forward the question to the shared chatbot.

     Returns whatever ``chatbot.get_response`` returns, unchanged.
     """
     reply = chatbot.get_response(question)
     return reply
+# Set up Gradio interface
 interface = gr.Blocks()
 with interface:
         ask_btn = gr.Button("Submit")
     answer = gr.Textbox(label="Answer")
+    # Connect buttons to functions
     upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
     ask_btn.click(ask_question, inputs=question_box, outputs=answer)
+# Launch Gradio interface
 interface.launch()