Spaces:

RoAr777
/

LS

Running

App Files Files Community

RoAr777 committed on Sep 13, 2024

Commit

55b729c

verified ·

1 Parent(s): 0fdf3e3

Update app.py

Browse files

Files changed (1) hide show

app.py +192 -259

app.py CHANGED Viewed

@@ -1,260 +1,193 @@
-import PyPDF2
-import re
-from sentence_transformers import SentenceTransformer
-import faiss
-from langchain.agents import initialize_agent, AgentType,Tool
-from langchain.schema import HumanMessage
-from langchain_google_genai import ChatGoogleGenerativeAI
-import gradio as gr
-import os
-import pytesseract
-from PIL import Image
-pytesseract.pytesseract.tesseract_cmd = r"tesseract.exe"
-def load_pdf_text(file_path):
-    with open(file_path, "rb") as file:
-        reader = PyPDF2.PdfReader(file)
-        text = ""
-        for page in reader.pages:
-            text += page.extract_text()
-        return text
-def chunk_text(text, chunk_size=700):
-    # Splits the text into chunks of chunk_size while preserving sentences
-    chunks = []
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) > chunk_size:
-            chunks.append(current_chunk)
-            current_chunk = sentence
-        else:
-            current_chunk += " " + sentence
-    chunks.append(current_chunk)
-    return chunks
-def load_and_process_chapters(directory):
-    chapter_data = {}
-    for filename in os.listdir(directory):
-        if filename.endswith(".pdf"):
-            file_path = os.path.join(directory, filename)
-            text = load_pdf_text(file_path)
-            chunks = chunk_text(text)
-            chapter_data[filename] = chunks # Use filename as key
-    return chapter_data
-ipc_data = load_and_process_chapters("IPC")
-crpc_data=load_and_process_chapters("CrPC")
-# Step 2: Embeddings and Indexing
-model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
-index2 = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
-# Flatten the chapter data and keep track of chapter and chunk indices
-flattened_data = []
-pdf_filenames = [] # Store PDF filenames for citation
-chunk_indices = []
-for pdf_filename, chunks in ipc_data.items():
-    for i, chunk in enumerate(chunks):
-        flattened_data.append(chunk)
-        pdf_filenames.append(pdf_filename)
-        chunk_indices.append(i)
-embeddings = model.encode(flattened_data)
-index.add(embeddings)
-flattened_data2 = []
-pdf_filenames2 = [] # Store PDF filenames for citation
-chunk_indices2 = []
-for pdf_filename, chunks in crpc_data.items():
-    for i, chunk in enumerate(chunks):
-        flattened_data2.append(chunk)
-        pdf_filenames2.append(pdf_filename)
-        chunk_indices2.append(i)
-embeddings = model.encode(flattened_data2)
-index2.add(embeddings)
-# Step 3: Retrieval with Citations using PDF filename
-def retrieve_info_with_citation(query, top_k=5):
-    query_embedding = model.encode([query])
-    D, I = index.search(query_embedding, k=top_k)
-    results = []
-    for i in range(min(top_k, len(I[0]))):
-        if D[0][i] < 1.0:  # Relevance threshold
-            chunk_index = I[0][i]
-            pdf_filename = pdf_filenames[chunk_index]
-            chunk_number = chunk_indices[chunk_index] + 1
-            match = flattened_data[chunk_index]
-            citation = f"Source: {pdf_filename}, Chunk: {chunk_number}"
-            results.append((match, citation))
-        else:
-            break
-    if results:
-        return results
-    else:
-        return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
-def retrieve_info_with_citation2(query, top_k=5):
-    query_embedding = model.encode([query])
-    D, I = index2.search(query_embedding, k=top_k)
-    results = []
-    for i in range(min(top_k, len(I[0]))):
-        if D[0][i] < 1.0:  # Relevance threshold
-            chunk_index = I[0][i]
-            pdf_filename = pdf_filenames2[chunk_index]
-            chunk_number = chunk_indices2[chunk_index] + 1
-            match = flattened_data2[chunk_index]
-            citation = f"Source: {pdf_filename}, Chunk: {chunk_number}"
-            results.append((match, citation))
-        else:
-            break
-    if results:
-        return results
-    else:
-        return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
-def retrieve_info(query):
-    results = retrieve_info_with_citation(query)
-    formatted_results = "\n\n".join([f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results)])
-    return formatted_results
-def retrieve_info2(query):
-    results = retrieve_info_with_citation2(query)
-    formatted_results = "\n\n".join([f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results)])
-    return formatted_results
-ipc_tool = Tool(
-    name="IPC Information Retrieval",
-    func=retrieve_info,
-    description="Retrieve information from the Indian Penal Code Related to query keyword(s)."
-)
-crpc_tool=Tool(
-    name="CrPC Information Retrieval",
-    func=retrieve_info2,
-    description="Retrieve information from the Code of Criminal Procedure(CrPC) Related to query keyword(s)."
-)
-llm = ChatGoogleGenerativeAI(
-    model="gemini-1.5-pro",
-    temperature=0.25,
-    max_tokens=None,
-    timeout=None,
-    max_retries=2,
-    prompt_template="""
-    You are a highly specialized legal assistant with deep knowledge of the Indian Penal Code (IPC).
-    Your primary task is to retrieve and summarize legal information accurately from the IPC.pdf document provided to you.
-    Your responses should be highly specific, fact-based, and free from any speculation or hallucinations.
-    Always cite the exact section from the IPC when providing an answer.
-    If the information is not available in the document, clearly state that and do not make any assumptions.
-    Example task: "What is the punishment for theft according to the IPC?"
-    Example response: "According to Section 379 of the IPC, the punishment for theft is imprisonment of either description for a term which may extend to three years, or with fine, or with both."
-    Task: {{query}}
-    Response:
-    """,
-)
-agent_tools = [ipc_tool,crpc_tool]
-agent = initialize_agent(
-    tools=agent_tools,
-    llm=llm,
-    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
-    verbose=True,
-    return_intermediate_steps=True,
-    handle_parsing_errors=True,
-)
-def encode_image_to_base64(image_path):
-    return pytesseract.image_to_string(Image.open(image_path))
-def chatbot_response(query):
-    if query.get('files'):
-        # Encode image to base64
-        image_data=""
-        for x in range(len(query["files"])):
-            image_data += f"{x}. "+encode_image_to_base64(query["files"][x]) +"\n"
-        # Create a multimodal message with both text and image data
-        message = HumanMessage(
-            content=[
-                {"type": "text", "text": query['text'] +" System :Image(s) was added to this prompt by this user. Text Extracted from this image (Some words may be misspelled ,Use your understanding ):"+image_data},  # Add text input
-            ]
-        )
-    else:
-        # If no image, only pass the text
-        message = HumanMessage(content=[{"type": "text", "text": query}])
-    # Invoke the model with the multimodal message
-    result = agent.invoke([message])
-    response = result['output']
-    intermediate_steps = result.get('intermediate_steps', [])
-    thought_process = ""
-    for action, observation in intermediate_steps:
-        thought_process += f"**Thought:** {action.log}\n"
-        thought_process += f"**Action:** {action.tool}\n"
-        thought_process += f"**Observation:** {observation}\n\n"
-    return response, thought_process.strip()
-# Step 5: Gradio Interface
-from gradio import ChatMessage
-def chatbot_interface(messages,prompt):
-    response, thought_process = chatbot_response(prompt)
-    #messages.append(ChatMessage(role="user", content=prompt))
-    for x in prompt["files"]:
-            messages.append(ChatMessage(role="user", content={"path": x, "mime_type": "image/png"}))
-    if prompt["text"] is not None:
-            messages.append(ChatMessage(role="user", content=prompt['text']))
-    if thought_process:
-        messages.append(ChatMessage(role="assistant", content=thought_process,metadata={"title": "🧠 Thought Process"}))
-    messages.append(ChatMessage(role="assistant", content=response))
-    return messages,  gr.MultimodalTextbox(value=None, interactive=True)
-def vote(data: gr.LikeData):
-    if data.liked:
-        print("You upvoted this response: " + data.value)
-    else:
-        print("You downvoted this response: " + data.value)
-with gr.Blocks(theme=gr.themes.Soft()) as iface:
-            gr.Markdown(
-                """
-                <div style="font-size: 24px; font-weight: bold; color: #333;">
-                    DoJ Chatbot
-                </div>
-                <div style="font-size: 16px; color: #555;">
-                    Ask questions related to the Department of Justice.
-                </div>
-                """
-            )
-            chatbot = gr.Chatbot(type="messages",avatar_images=("user.jpeg", "logo.jpeg"), bubble_full_width=True)  # Chatbot component to display conversation history
-            query_input = gr.MultimodalTextbox(interactive=True,
-                                      placeholder="Enter message or upload file...", show_label=False)
-            submit_button = gr.Button("Send")
-            submit_button.click(chatbot_interface, [chatbot, query_input], [chatbot, query_input])
-            query_input.submit(chatbot_interface, [chatbot, query_input], [chatbot,query_input])
-            chatbot.like(vote, None, None)  # Adding like/dislike functionality to the chatbot
-iface.launch(
-    show_error=True,
-    prevent_thread_lock=True
 )

+import PyPDF2
+import re
+from sentence_transformers import SentenceTransformer
+import faiss
+from langchain.agents import initialize_agent, AgentType,Tool
+from langchain.schema import HumanMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
+import gradio as gr
+import os
+import pytesseract
+from PIL import Image
+# Name/path of the Tesseract executable that pytesseract invokes for OCR.
+# NOTE(review): "tesseract.exe" looks Windows-specific — confirm it resolves on the deployment host.
+pytesseract.pytesseract.tesseract_cmd = r"tesseract.exe"
+# Sentence-embedding model used to encode incoming queries.
+# NOTE(review): presumably the same model the saved indexes were built with — confirm.
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+# The FAISS indexes are read from disk, so no corpus is embedded at startup.
+# No `model.encode(flattened_data)` call belongs here: `flattened_data` is not
+# defined in this module and the resulting embeddings were never used.
+index = faiss.read_index('IPC_index.faiss')
+index2 = faiss.read_index('CrPC_index.faiss')
+# Step 3: Retrieval with citations (the source label is the code name, IPC or CrPC, not a PDF filename)
+def retrieve_info_with_citation(query, top_k=5):
+    """Search the IPC index for entries close to *query*.
+
+    The query is encoded with the module-level ``model`` and ``index`` is
+    asked for up to ``top_k`` neighbours; hits are kept while their ``D``
+    value stays below 1.0.
+
+    Returns a list of ``(match, citation)`` tuples, or a one-element list
+    holding an apology message with ``"Source: N/A"`` when nothing is kept.
+    """
+    query_embedding = model.encode([query])
+    # D and I are the two arrays returned by index.search
+    # (presumably distances and row ids — confirm against the FAISS docs).
+    D, I = index.search(query_embedding, k=top_k)
+    results = []
+    for i in range(min(top_k, len(I[0]))):
+        if D[0][i] < 1.0:  # Relevance threshold
+            chunk_index = I[0][i]
+            citation = f"Source: IPC"
+            # NOTE(review): `match` is never assigned in this function and no
+            # module-level `match` is visible in this file, so this line raises
+            # NameError for every hit that passes the threshold. `chunk_index`
+            # is computed but unused; the chunk text it refers to still needs
+            # a data source (nothing in this module loads it).
+            results.append((match, citation))
+        else:
+            # Stops at the first hit over the threshold
+            # (assumes hits arrive best-first — TODO confirm).
+            break
+    if results:
+        return results
+    else:
+        return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
+def retrieve_info_with_citation2(query, top_k=5):
+    """Search the CrPC index for entries close to *query*.
+
+    The query is encoded with the module-level ``model`` and ``index2`` is
+    asked for up to ``top_k`` neighbours; hits are kept while their ``D``
+    value stays below 1.0.
+
+    Returns a list of ``(match, citation)`` tuples, or a one-element list
+    holding an apology message with ``"Source: N/A"`` when nothing is kept.
+    """
+    query_embedding = model.encode([query])
+    # D and I are the two arrays returned by index2.search
+    # (presumably distances and row ids — confirm against the FAISS docs).
+    D, I = index2.search(query_embedding, k=top_k)
+    results = []
+    for i in range(min(top_k, len(I[0]))):
+        if D[0][i] < 1.0:  # Relevance threshold
+            chunk_index = I[0][i]
+            citation = f"Source: CrPC"
+            # NOTE(review): `match` is never assigned in this function and no
+            # module-level `match` is visible in this file, so this line raises
+            # NameError for every hit that passes the threshold. `chunk_index`
+            # is computed but unused; the chunk text it refers to still needs
+            # a data source (nothing in this module loads it).
+            results.append((match, citation))
+        else:
+            # Stops at the first hit over the threshold
+            # (assumes hits arrive best-first — TODO confirm).
+            break
+    if results:
+        return results
+    else:
+        return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
+def retrieve_info(query):
+    """Return the IPC hits for *query* as one numbered, citation-tagged string."""
+    numbered = []
+    for position, (passage, source) in enumerate(retrieve_info_with_citation(query), start=1):
+        # Each entry is "<n>. <text>" followed by its citation on the next line.
+        numbered.append(f"{position}. {passage}\n{source}")
+    return "\n\n".join(numbered)
+def retrieve_info2(query):
+    """Return the CrPC hits for *query* as one numbered, citation-tagged string."""
+    numbered = []
+    for position, (passage, source) in enumerate(retrieve_info_with_citation2(query), start=1):
+        # Each entry is "<n>. <text>" followed by its citation on the next line.
+        numbered.append(f"{position}. {passage}\n{source}")
+    return "\n\n".join(numbered)
+# Agent tool wrapping the IPC retrieval helper; the description is the text
+# the agent sees when deciding which tool to call.
+ipc_tool = Tool(
+    name="IPC Information Retrieval",
+    func=retrieve_info,
+    description="Retrieve information from the Indian Penal Code Related to query keyword(s)."
+)
+# Agent tool wrapping the CrPC retrieval helper.
+crpc_tool=Tool(
+    name="CrPC Information Retrieval",
+    func=retrieve_info2,
+    description="Retrieve information from the Code of Criminal Procedure(CrPC) Related to query keyword(s)."
+)
+# Chat model that drives the agent.
+# NOTE(review): confirm that ChatGoogleGenerativeAI actually accepts and uses a
+# `prompt_template` keyword; nothing else in this file reads it. The template
+# is a plain (non-f) string, so `{{query}}` stays as literal double braces, and
+# its wording mentions only the IPC although a CrPC tool is registered too.
+llm = ChatGoogleGenerativeAI(
+    model="gemini-1.5-pro",
+    temperature=0.25,
+    max_tokens=None,
+    timeout=None,
+    max_retries=2,
+    prompt_template="""
+    You are a highly specialized legal assistant with deep knowledge of the Indian Penal Code (IPC).
+    Your primary task is to retrieve and summarize legal information accurately from the IPC.pdf document provided to you.
+    Your responses should be highly specific, fact-based, and free from any speculation or hallucinations.
+    Always cite the exact section from the IPC when providing an answer.
+    If the information is not available in the document, clearly state that and do not make any assumptions.
+    Example task: "What is the punishment for theft according to the IPC?"
+    Example response: "According to Section 379 of the IPC, the punishment for theft is imprisonment of either description for a term which may extend to three years, or with fine, or with both."
+    Task: {{query}}
+    Response:
+    """,
+)
+# Both retrieval tools are handed to a structured-chat ReAct agent.
+agent_tools = [ipc_tool,crpc_tool]
+agent = initialize_agent(
+    tools=agent_tools,
+    llm=llm,
+    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
+    verbose=True,
+    # chatbot_response reads result['intermediate_steps'] to build the
+    # "thought process" shown in the UI, so the steps must be returned.
+    return_intermediate_steps=True,
+    handle_parsing_errors=True,
+)
+def encode_image_to_base64(image_path):
+    """Return the text that pytesseract extracts from the image at *image_path*.
+
+    Despite the name, nothing is base64-encoded here: the image is run
+    through OCR and the recognised text is returned. The name is kept so
+    existing callers continue to work.
+    """
+    # The context manager closes the image's file handle once OCR has finished.
+    with Image.open(image_path) as image:
+        return pytesseract.image_to_string(image)
+def chatbot_response(query):
+    """Run the agent on a user prompt and return its answer and reasoning trace.
+
+    Args:
+        query: Either a plain string, or a mapping with a "text" entry and an
+            optional "files" list of image paths (the shape chatbot_interface
+            passes in).
+
+    Returns:
+        A ``(response, thought_process)`` tuple: the agent's ``output`` and a
+        Markdown-formatted dump of its intermediate steps (empty string when
+        there are none).
+    """
+    if isinstance(query, str):
+        text, files = query, []
+    else:
+        # "text" can be None when only files were submitted; fall back to ""
+        # so the string concatenation below cannot fail.
+        text = query.get("text") or ""
+        files = query.get("files") or []
+    if files:
+        # OCR every attachment and number the extracted texts from 0.
+        image_data = "".join(
+            f"{x}. " + encode_image_to_base64(path) + "\n"
+            for x, path in enumerate(files)
+        )
+        prompt_text = text + " System :Image(s) was added to this prompt by this user. Text Extracted from this image (Some words may be misspelled ,Use your understanding ):" + image_data
+    else:
+        # No attachments: send the user's text itself, not the whole prompt mapping.
+        prompt_text = text
+    message = HumanMessage(content=[{"type": "text", "text": prompt_text}])
+    result = agent.invoke([message])
+    response = result['output']
+    # Each intermediate step is an (action, observation) pair.
+    thought_process = "".join(
+        f"**Thought:** {action.log}\n"
+        f"**Action:** {action.tool}\n"
+        f"**Observation:** {observation}\n\n"
+        for action, observation in result.get('intermediate_steps', [])
+    )
+    return response, thought_process.strip()
+# Step 5: Gradio Interface
+from gradio import ChatMessage
+def chatbot_interface(messages,prompt):
+    """Gradio callback: answer *prompt* and append the exchange to *messages*.
+
+    Appends, in order: one user message per uploaded file, the user's text
+    (when it is not None), the agent's thought process (when non-empty) and
+    the agent's answer. Returns the updated history together with a fresh,
+    empty MultimodalTextbox to reset the input.
+    """
+    # The agent is queried first, before the history is touched.
+    answer, reasoning = chatbot_response(prompt)
+    messages.extend(
+        ChatMessage(role="user", content={"path": attachment, "mime_type": "image/png"})
+        for attachment in prompt["files"]
+    )
+    user_text = prompt["text"]
+    if user_text is not None:
+        messages.append(ChatMessage(role="user", content=user_text))
+    if reasoning:
+        messages.append(
+            ChatMessage(role="assistant", content=reasoning, metadata={"title": "🧠 Thought Process"})
+        )
+    messages.append(ChatMessage(role="assistant", content=answer))
+    cleared_input = gr.MultimodalTextbox(value=None, interactive=True)
+    return messages, cleared_input
+def vote(data: gr.LikeData):
+    """Print whether the user upvoted or downvoted a chatbot message.
+
+    Args:
+        data: Like event payload; ``data.liked`` selects the wording and
+            ``data.value`` is echoed in the log line.
+    """
+    verdict = "upvoted" if data.liked else "downvoted"
+    # Formatting with an f-string instead of `+` avoids a TypeError when
+    # data.value is not a str (e.g. a file or structured message payload).
+    print(f"You {verdict} this response: {data.value}")
+# Build the chat UI: header, message history, multimodal input and send button.
+with gr.Blocks(theme=gr.themes.Soft()) as iface:
+            # Static page header rendered from inline HTML.
+            gr.Markdown(
+                """
+                <div style="font-size: 24px; font-weight: bold; color: #333;">
+                    DoJ Chatbot
+                </div>
+                <div style="font-size: 16px; color: #555;">
+                    Ask questions related to the Department of Justice.
+                </div>
+                """
+            )
+            chatbot = gr.Chatbot(type="messages",avatar_images=("user.jpeg", "logo.jpeg"), bubble_full_width=True)  # Chatbot component to display conversation history
+            query_input = gr.MultimodalTextbox(interactive=True,
+                                      placeholder="Enter message or upload file...", show_label=False)
+            submit_button = gr.Button("Send")
+            # Clicking "Send" and submitting the textbox both run the same
+            # callback, which updates the history and resets the input box.
+            submit_button.click(chatbot_interface, [chatbot, query_input], [chatbot, query_input])
+            query_input.submit(chatbot_interface, [chatbot, query_input], [chatbot,query_input])
+            chatbot.like(vote, None, None)  # Adding like/dislike functionality to the chatbot
+# NOTE(review): prevent_thread_lock=True presumably makes launch() return
+# without blocking — confirm the hosting process stays alive afterwards.
+iface.launch(
+    show_error=True,
+    prevent_thread_lock=True
 )