Spaces:
Sleeping
Sleeping
File size: 3,101 Bytes
51c2867 1ee87ce b373765 f28212f 155b271 b373765 155b271 ffd7a87 b373765 ffd7a87 b373765 f28212f 1ee87ce 155b271 f28212f 1ee87ce 155b271 f28212f 1ee87ce 155b271 b373765 f28212f 155b271 b373765 155b271 a5a3b55 155b271 b373765 1ee87ce 155b271 b373765 155b271 1ee87ce b373765 155b271 b373765 1ee87ce b373765 1ee87ce b373765 1ee87ce b373765 155b271 b373765 155b271 45d1cd8 b373765 51c2867 1ee87ce 45d1cd8 b373765 1ee87ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import gradio as gr
import os
from groq import Groq
import pdfplumber
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
# --- Helper Functions ---
def initialize_groq():
    """Build a Groq API client from the GROQ_API_KEY environment variable."""
    api_key = os.getenv("GROQ_API_KEY")
    return Groq(api_key=api_key)
def clean_question(user_question):
    """Normalize known misspellings in the user's question.

    Args:
        user_question: Raw question text from the UI.

    Returns:
        The question with each known typo replaced by its correct form.
    """
    # Known typo -> correct spelling (domain-specific terms users get wrong).
    known_typos = (("slaps", "slabs"), ("salried", "salaried"))
    for typo, fixed in known_typos:
        user_question = user_question.replace(typo, fixed)
    return user_question
def read_pdf(uploaded_file):
    """Extract text from an uploaded PDF, with OCR fallback for scanned pages.

    Args:
        uploaded_file: Either a filesystem path string (what
            ``gr.File(type="filepath")`` delivers) or a file-like object
            exposing a ``.name`` attribute.

    Returns:
        The extracted text, stripped, or an ``"Error reading PDF: ..."``
        message string on any failure.
    """
    try:
        # Bug fix: gr.File(type="filepath") passes a plain str path, which has
        # no .name attribute — the old code raised AttributeError on every
        # upload and always fell into the error branch.
        path = getattr(uploaded_file, "name", uploaded_file)
        with pdfplumber.open(path) as pdf:
            full_text = ""
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    full_text += text
        if not full_text.strip():
            # No embedded text layer: assume a scanned PDF and OCR each page.
            images = convert_from_path(path)
            full_text = "".join(pytesseract.image_to_string(img) for img in images)
        return full_text.strip()
    except Exception as e:
        # Report the failure as a message so the UI shows it instead of crashing.
        return f"Error reading PDF: {e}"
def chunk_text(text, chunk_size=3000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size
    return chunks
def similarity(query, text):
    """Score *text* against *query* by counting shared lowercase words."""
    shared = set(query.lower().split()).intersection(text.lower().split())
    return len(shared)
def retrieve_relevant_document(user_question, document_text):
    """Pick the chunk of *document_text* with the highest word overlap
    against *user_question*; return "" for an empty document."""
    chunks = chunk_text(document_text)
    if not chunks:
        return ""
    # Linear scan keeping the first chunk with the best score (ties go to
    # the earliest chunk, matching max()'s behavior).
    best_chunk = chunks[0]
    best_score = similarity(user_question, best_chunk)
    for candidate in chunks[1:]:
        score = similarity(user_question, candidate)
        if score > best_score:
            best_chunk, best_score = candidate, score
    return best_chunk
def answer_question(file, user_question):
    """Answer *user_question* from the uploaded PDF via the Groq chat API.

    Args:
        file: Path of the uploaded PDF (None/"" when nothing was uploaded).
        user_question: Question typed by the user.

    Returns:
        The model's answer, or a human-readable error/status message.
    """
    if not file:
        return "Please upload a PDF document."
    user_question = clean_question(user_question)
    document_text = read_pdf(file)
    # Bug fix: read_pdf signals failure with a truthy "Error reading PDF: ..."
    # string; previously that message was silently fed to the model as if it
    # were document content. Surface it to the user instead.
    if document_text.startswith("Error reading PDF:"):
        return document_text
    if not document_text:
        return "❌ Document appears empty or unreadable. Please try a different file."
    relevant_chunk = retrieve_relevant_document(user_question, document_text)
    prompt = f"""You are a tax/legal assistant. Read the following extract and answer the user's query.
User Question: {user_question}
Relevant Extract from Document:
{relevant_chunk}
"""
    try:
        client = initialize_groq()
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama3-8b-8192"
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {e}"
# --- Gradio UI ---
def create_interface():
    """Assemble the Gradio Blocks app: PDF upload, question box, answer box."""
    with gr.Blocks() as demo:
        gr.Markdown("## 📄 Legal Document Q&A\nUpload a PDF and ask questions based on its content.")
        pdf_upload = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
        question_box = gr.Textbox(label="Your Question")
        answer_box = gr.Textbox(label="Answer")
        ask_button = gr.Button("Ask")
        # Route the click through the Q&A pipeline.
        ask_button.click(
            fn=answer_question,
            inputs=[pdf_upload, question_box],
            outputs=answer_box,
        )
    return demo
# Launch the web UI only when run as a script (not when imported as a module).
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
|