Spaces:
Sleeping
Sleeping
| import os | |
| import tempfile | |
| import gradio as gr | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.memory import ConversationBufferMemory | |
| from langchain.prompts import PromptTemplate | |
| from langchain.chains import RetrievalQA | |
| from langchain_groq import ChatGroq | |
class ChatbotModel:
    """Question-answering chatbot over OCR text from an uploaded PDF or image.

    The uploaded document is OCR-ed with Tesseract (Gujarati + English),
    split into overlapping chunks, embedded into a FAISS index and queried
    through a ``RetrievalQA`` "stuff" chain backed by a Groq-hosted LLM.

    Attributes:
        embeddings: Sentence-transformer embeddings used to build the index.
        llm: The Groq chat model that answers questions.
        memory: Conversation buffer injected into the prompt as ``{history}``.
        template: Raw prompt text used to build ``QA_CHAIN_PROMPT``.
        QA_CHAIN_PROMPT: Prompt template shared by every QA chain.
        db1: FAISS index of the current document, or ``None``.
        qa_chain: Retrieval chain for the current document, or ``None``.
    """

    # File extensions handled by each OCR path.
    PDF_EXTENSION = '.pdf'
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    # Value substituted for ``{document_type}`` until a file has been processed.
    DEFAULT_DOCUMENT_TYPE = "document"

    def __init__(self):
        """Create the embeddings, LLM client, memory and prompt template.

        Raises:
            RuntimeError: If the ``GROQ_API_KEY`` environment variable is unset.
        """
        # The API key must come from the environment (for example a Space
        # secret). It used to be hard-coded here; a key that has been
        # committed to source must be treated as leaked and revoked.
        if not os.environ.get("GROQ_API_KEY"):
            raise RuntimeError(
                "GROQ_API_KEY is not set. Provide it through the environment "
                "instead of embedding it in the source code."
            )

        # Initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True},
        )

        # Initialize the chat model
        self.llm = ChatGroq(
            model='llama3-70b-8192',
            temperature=0.5,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )

        # Initialize memory for conversation. ``input_key`` tells the memory
        # which chain input is the user's turn; ``memory_key`` is the prompt
        # variable that receives the accumulated history.
        self.memory = ConversationBufferMemory(memory_key="history", input_key="question")

        # Create the QA chain prompt template
        self.template = """You are an intelligent educational assistant specialized in handling queries about documents in both English and Gujarati languages. You have been provided with OCR-processed text from {document_type} that contains important educational information.
Core Responsibilities:
1. Language Processing:
- Identify the language of the user's query (English or Gujarati)
- Respond in the same language as the query
- If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
- For technical terms, provide both English and Gujarati versions when relevant
2. Document Understanding:
- Analyze the OCR-processed text from the uploaded {document_type}
- Account for potential OCR errors or misinterpretations
- Focus on extracting accurate information despite possible OCR imperfections
3. Response Guidelines:
- Provide direct, clear answers based solely on the document content
- If information is unclear due to OCR quality, mention this limitation
- For numerical data (dates, percentages, marks), double-check accuracy before responding
- If information is not found in the document, clearly state: "This information is not present in the uploaded document"
4. Educational Context:
- Maintain focus on educational queries related to the document content
- For admission-related queries, emphasize important deadlines and requirements
- For scholarship information, highlight eligibility criteria and application processes
- For course-related queries, provide detailed, accurate information from the document
5. Response Format:
- Structure responses clearly with relevant subpoints when necessary
- For complex information, break down the answer into digestible parts
- Include relevant reference points from the document when applicable
- Format numerical data and dates clearly
6. Quality Control:
- Verify that responses align with the document content
- Don't make assumptions beyond the provided information
- If multiple interpretations are possible due to OCR quality, mention all possibilities
- Maintain consistency in terminology throughout the conversation
Important Rules:
- Never make up information not present in the document
- Don't combine information from previous conversations or external knowledge
- Always indicate if certain parts of the document are unclear due to OCR quality
- Maintain professional tone while being accessible to students and parents
- If the query is out of scope of the uploaded document, politely redirect to relevant official sources
Context from uploaded document:
{context}
Chat History:
{history}
Current Question: {question}
Assistant: Let me provide a clear and accurate response based on the uploaded document content...
"""
        # The template also references {document_type}, which the chain never
        # supplies at query time. Binding it as a partial variable keeps the
        # chain's runtime inputs limited to history/context/question.
        self.QA_CHAIN_PROMPT = PromptTemplate(
            input_variables=["history", "context", "question"],
            partial_variables={"document_type": self.DEFAULT_DOCUMENT_TYPE},
            template=self.template,
        )

        self.db1 = None
        self.qa_chain = None

    def ocr_image(self, image_path, language='eng+guj'):
        """Return the text Tesseract recognises in the image at *image_path*.

        Args:
            image_path: Path of the image file to read.
            language: Tesseract language spec (``+``-separated codes).
        """
        # The context manager closes the underlying file handle after OCR.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img, lang=language)

    def ocr_pdf(self, pdf_path, language='eng+guj'):
        """Return the OCR text of every page of a PDF, joined by newlines.

        Args:
            pdf_path: Path of the PDF file to rasterise and read.
            language: Tesseract language spec (``+``-separated codes).
        """
        pages = convert_from_path(pdf_path)
        return "\n".join(
            pytesseract.image_to_string(page, lang=language) for page in pages
        )

    def process_file(self, uploaded_file):
        """Process an uploaded file and initialize the QA chain.

        Args:
            uploaded_file: Either a filesystem path (``str`` / ``os.PathLike``)
                or an object exposing ``name`` and, optionally, ``read()``.
                Both shapes are accepted because the value handed over by the
                upload widget depends on the Gradio version/configuration.

        Returns:
            A human-readable status message.
        """
        if uploaded_file is None:
            return "Please upload a file before processing."

        if isinstance(uploaded_file, (str, os.PathLike)):
            source_name = os.fspath(uploaded_file)
        else:
            source_name = getattr(uploaded_file, "name", "") or ""
        file_extension = os.path.splitext(str(source_name))[1].lower()

        # Reject unsupported formats before touching the filesystem.
        if file_extension == self.PDF_EXTENSION:
            run_ocr, document_type = self.ocr_pdf, "PDF document"
        elif file_extension in self.IMAGE_EXTENSIONS:
            run_ocr, document_type = self.ocr_image, "image"
        else:
            return "Unsupported file format."

        temp_path = None
        if hasattr(uploaded_file, "read"):
            # File-like upload: spill the bytes to a temporary file because
            # the OCR helpers work on paths.
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
                temp_file.write(uploaded_file.read())
                temp_path = temp_file.name
            ocr_path = temp_path
        else:
            # Path-style upload: the file is already on disk.
            ocr_path = str(source_name)

        try:
            raw_text = run_ocr(ocr_path, language='guj+eng')
        finally:
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    # Best-effort cleanup; a leftover temp file must not mask
                    # the OCR result or the original exception.
                    pass

        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        text_chunks = text_splitter.split_text(raw_text)
        if not text_chunks:
            # Nothing to index; building a FAISS store from zero texts fails.
            return "No readable text could be extracted from the file."

        # split_text() yields plain strings, so the index must be built with
        # from_texts(); from_documents() expects Document objects.
        vector_store = FAISS.from_texts(text_chunks, self.embeddings)

        # History gathered for a previous document must not leak into answers
        # about the new one.
        self.memory.clear()

        self.db1 = vector_store
        self.qa_chain = RetrievalQA.from_chain_type(
            self.llm,
            retriever=self.db1.as_retriever(),
            chain_type='stuff',
            verbose=True,
            chain_type_kwargs={
                "verbose": True,
                "prompt": self.QA_CHAIN_PROMPT.partial(document_type=document_type),
                "memory": self.memory,
            },
        )
        return "File processed successfully!"

    def get_response(self, user_input):
        """Generate response to the user input question.

        Args:
            user_input: The question typed by the user.

        Returns:
            The chain's answer, or a guidance message when no document has
            been processed yet or the question is empty.
        """
        if not self.qa_chain:
            return "Please upload and process a file before asking questions."
        if not user_input or not user_input.strip():
            return "Please enter a question."

        # invoke() is the supported entry point; calling the chain object
        # directly is deprecated.
        response = self.qa_chain.invoke({"query": user_input})
        return response["result"]
# Initialize the chatbot: a single module-level instance that the Gradio
# callbacks below share. Constructing it runs ChatbotModel.__init__, which
# sets up the embeddings, the LLM client, the memory and the prompt.
chatbot = ChatbotModel()
| # Define Gradio interface functions | |
def upload_and_process(file):
    """Gradio callback: pass the uploaded file to the chatbot and return its status text."""
    status_message = chatbot.process_file(file)
    return status_message
def ask_question(question):
    """Gradio callback: forward the question to the chatbot and return its answer."""
    reply = chatbot.get_response(question)
    return reply
# Build the Gradio UI: one row for uploading/processing a document and one
# row for asking questions, each followed by its output textbox.
# NOTE(review): the source was flattened, so which widgets sit inside each
# gr.Row() is reconstructed here -- confirm against the original layout.
with gr.Blocks() as interface:
    gr.Markdown("# Educational Chatbot with Document Analysis")

    # Document upload and processing
    with gr.Row():
        file_upload = gr.File(label="Upload PDF or Image")
        upload_btn = gr.Button("Process File")
    output = gr.Textbox(label="File Processing Status")

    # Question answering
    with gr.Row():
        question_box = gr.Textbox(label="Ask a Question")
        ask_btn = gr.Button("Submit")
    answer = gr.Textbox(label="Answer")

    # Wire each button to its callback
    upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
    ask_btn.click(ask_question, inputs=question_box, outputs=answer)

# Start serving the app
interface.launch()