Spaces:
Sleeping
Sleeping
| import os | |
| import tempfile | |
| import gradio as gr | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.memory import ConversationBufferMemory | |
| from langchain.prompts import PromptTemplate | |
| from langchain.chains import RetrievalQA | |
| from langchain_groq import ChatGroq | |
class ChatbotModel:
    """Question-answering chatbot over the text of one OCR'd document.

    A PDF or image is OCR'd with Tesseract, the text is split into chunks,
    the chunks are embedded into a FAISS index, and a ``RetrievalQA`` chain
    answers questions against that index using a Groq-hosted chat model.
    """

    # Extensions routed to single-image OCR; ``.pdf`` is handled separately.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self):
        """Create the embedding model, LLM client, memory and prompt.

        Raises:
            RuntimeError: if the ``GROQ_API_KEY`` environment variable is
                missing or empty.
        """
        # The API key must be supplied by the deployment environment (for
        # example a Space secret); credentials must never live in source.
        # ChatGroq is built below without an explicit key, so it is expected
        # to read this variable itself.
        if not os.environ.get("GROQ_API_KEY"):
            raise RuntimeError(
                "GROQ_API_KEY environment variable is not set; "
                "configure it as a secret before starting the app."
            )
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.llm = ChatGroq(
            model='llama3-70b-8192',
            temperature=0.5,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )
        self.memory = ConversationBufferMemory(memory_key="history", input_key="question")
        # NOTE(review): this literal is a placeholder. The real prompt must
        # contain the {history}, {context} and {question} placeholders that
        # are declared as input variables just below.
        self.template = """You are an intelligent assistant... (Rest of your prompt as is)"""
        self.QA_CHAIN_PROMPT = PromptTemplate(
            input_variables=["history", "context", "question"],
            template=self.template
        )
        # Both stay None until process_file() has indexed a document.
        self.db1 = None
        self.qa_chain = None

    def ocr_image(self, image_path, language='eng+guj'):
        """Return the text Tesseract recognises in the image at *image_path*."""
        # The context manager closes the underlying file handle after OCR.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img, lang=language)

    def ocr_pdf(self, pdf_path, language='eng+guj'):
        """Return the OCR text of every page of *pdf_path*, newline-joined."""
        pages = convert_from_path(pdf_path)
        return "\n".join(
            pytesseract.image_to_string(page, lang=language) for page in pages
        )

    def process_file(self, uploaded_file):
        """OCR an uploaded file and (re)build the retrieval chain over it.

        Args:
            uploaded_file: either a filesystem path (``str`` / ``os.PathLike``)
                or a file-like object exposing ``name`` and, when ``name`` is
                not an existing file on disk, ``read()``.

        Returns:
            A human-readable status string. Unsupported, unreadable or empty
            inputs are reported through this string rather than raised.
        """
        if uploaded_file is None:
            return "Please upload a file before processing."

        if isinstance(uploaded_file, (str, os.PathLike)):
            source_name = os.fspath(uploaded_file)
        else:
            source_name = getattr(uploaded_file, "name", "") or ""

        # Decide on the OCR routine first so that unsupported uploads never
        # create a temporary file at all.
        file_extension = os.path.splitext(source_name)[1].lower()
        if file_extension == '.pdf':
            run_ocr = self.ocr_pdf
        elif file_extension in self.IMAGE_EXTENSIONS:
            run_ocr = self.ocr_image
        else:
            return "Unsupported file format."

        temp_path = None
        if os.path.isfile(source_name):
            # The upload already lives on disk; OCR it in place.
            ocr_path = source_name
        elif hasattr(uploaded_file, "read"):
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
                temp_file.write(uploaded_file.read())
                temp_path = temp_file.name
            ocr_path = temp_path
        else:
            return "Uploaded file could not be read."

        try:
            raw_text = run_ocr(ocr_path, language='guj+eng')
        finally:
            # delete=False means the temporary copy must be removed manually.
            if temp_path is not None and os.path.exists(temp_path):
                os.remove(temp_path)

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        text_chunks = text_splitter.split_text(raw_text)
        if not text_chunks:
            # Nothing to index; keep any previously built chain untouched.
            return "No readable text was found in the file."

        # split_text() yields plain strings, so the index is built with
        # from_texts(); from_documents() expects Document objects.
        self.db1 = FAISS.from_texts(text_chunks, self.embeddings)
        self.qa_chain = RetrievalQA.from_chain_type(
            self.llm,
            retriever=self.db1.as_retriever(),
            chain_type='stuff',
            verbose=True,
            chain_type_kwargs={
                "verbose": True,
                "prompt": self.QA_CHAIN_PROMPT,
                "memory": self.memory
            }
        )
        return "File processed successfully!"

    def get_response(self, user_input):
        """Answer *user_input* using the chain built by ``process_file``.

        Returns:
            The chain's ``"result"`` text, or a guidance message when no
            document has been processed yet or the question is blank.
        """
        if self.qa_chain is None:
            return "Please upload and process a file before asking questions."
        if not user_input or not user_input.strip():
            return "Please enter a question."
        # invoke() replaces the deprecated direct call on the chain object.
        response = self.qa_chain.invoke({"query": user_input})
        return response["result"]
# Single module-level ChatbotModel instance; created once at import time.
chatbot = ChatbotModel()
def upload_and_process(file):
    """Pass the uploaded *file* to the module-level chatbot.

    Returns whatever ``chatbot.process_file`` returns for that file.
    """
    status = chatbot.process_file(file)
    return status
def ask_question(question):
    """Forward *question* to the module-level chatbot and return its reply."""
    reply = chatbot.get_response(question)
    return reply
# Build the Gradio UI and start serving it.
# NOTE(review): the original indentation was lost, so the nesting of widgets
# inside each gr.Row() below is reconstructed -- confirm against the intended
# layout.
with gr.Blocks() as interface:
    gr.Markdown("# Educational Chatbot with Document Analysis")

    # Upload controls, with the processing status shown underneath.
    with gr.Row():
        file_upload = gr.File(label="Upload PDF or Image")
        upload_btn = gr.Button("Process File")
    output = gr.Textbox(label="File Processing Status")

    # Question controls, with the answer shown underneath.
    with gr.Row():
        question_box = gr.Textbox(label="Ask a Question")
        ask_btn = gr.Button("Submit")
    answer = gr.Textbox(label="Answer")

    # Wire each button to its callback.
    upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
    ask_btn.click(ask_question, inputs=question_box, outputs=answer)

interface.launch()