amritn8 commited on
Commit
d3a1193
·
verified ·
1 Parent(s): 3533296

Upload 5 files

Browse files
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import whisper
4
+ import PyPDF2
5
+ import gradio as gr
6
+ from transformers import BertTokenizerFast, BertForQuestionAnswering, pipeline
7
+ from torch.nn.functional import softmax
8
+ from docx import Document
9
+
10
# Select GPU when available; the QA model and its inputs are moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Extractive QA model fine-tuned on SQuAD2 (supports "no answer" predictions).
qa_model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2").to(device)
tokenizer = BertTokenizerFast.from_pretrained("deepset/bert-base-cased-squad2")
# NOTE(review): summarization pipeline is created without a device= argument,
# so it runs on CPU even when CUDA is available — confirm this is intended.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Whisper "base" checkpoint for transcribing spoken questions.
whisper_model = whisper.load_model("base")
16
+
17
+
18
def extract_text(file_obj):
    """Extract plain text from an uploaded .pdf, .docx, or .txt file.

    Parameters
    ----------
    file_obj : file-like
        Must expose a ``.name`` attribute (used only for extension sniffing,
        case-insensitively); ``.txt`` files are read via ``.read()`` and
        decoded as UTF-8.

    Returns
    -------
    str
        The extracted text, or an empty string for unsupported extensions.
    """
    ext = os.path.splitext(file_obj.name)[1].lower()
    if ext == ".pdf":
        reader = PyPDF2.PdfReader(file_obj)
        # Bug fix: the original called p.extract_text() twice per page
        # (once in the filter, once in the join), doing the extraction
        # work twice. Extract once and reuse the result.
        return "\n".join(
            page_text for page in reader.pages if (page_text := page.extract_text())
        )
    elif ext == ".docx":
        doc = Document(file_obj)
        return "\n".join(para.text for para in doc.paragraphs)
    elif ext == ".txt":
        return file_obj.read().decode("utf-8")
    return ""
29
+
30
+
31
def summarize_text(text):
    """Summarize *text* with the BART summarization pipeline.

    Inputs shorter than 50 characters return a fixed notice instead of
    invoking the model; longer inputs are truncated to the first 1000
    characters before summarization.
    """
    if len(text) < 50:
        return "Text too short to summarize."
    # Slicing is a no-op for inputs already within the limit, so the
    # explicit length check the original used is unnecessary.
    truncated = text[:1000]
    result = summarizer(truncated, max_length=120, min_length=30, do_sample=False)
    return result[0]["summary_text"]
38
+
39
+
40
def ask_question(question, context):
    """Answer *question* extractively from *context* with the BERT QA model.

    Returns a formatted string containing the decoded answer span and a
    confidence percentage (product of the start- and end-position softmax
    probabilities).
    """
    inputs = tokenizer.encode_plus(
        question, context, return_tensors="pt", truncation=True, max_length=512
    ).to(device)
    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits) + 1
    # Bug fix: the model can predict an end position before the start
    # position, which previously decoded to an empty string. Clamp to a
    # one-token span in that case.
    if end_idx <= start_idx:
        end_idx = start_idx + 1
    score = (
        softmax(outputs.start_logits, dim=1)[0][start_idx]
        * softmax(outputs.end_logits, dim=1)[0][end_idx - 1]
    )
    # skip_special_tokens avoids leaking [CLS]/[SEP] into the answer when
    # the SQuAD2 model predicts "no answer" (a span at position 0).
    answer = tokenizer.decode(
        inputs["input_ids"][0][start_idx:end_idx], skip_special_tokens=True
    )
    return f"Answer: {answer.strip()}\nConfidence: {round(score.item()*100, 2)}%"
49
+
50
+
51
def transcribe(audio_path):
    """Run Whisper speech-to-text on the audio file at *audio_path* and
    return the transcribed text."""
    return whisper_model.transcribe(audio_path)["text"]
54
+
55
+
56
# ---- Gradio UI -------------------------------------------------------------
# Two tabs: (1) extractive Q&A over an uploaded or pasted document, with the
# question either typed or spoken; (2) document summarization. The app is
# launched at module import time via demo.launch().
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️📄 LexPilot: Voice + Document Q&A Assistant")
    gr.Markdown("Upload a document or paste content. Ask questions by typing or using your voice.")

    with gr.Tab("Question Answering"):
        with gr.Row():
            uploaded_file = gr.File(label="Upload .pdf / .docx / .txt", file_types=[".pdf", ".docx", ".txt"])
            pasted_text = gr.Textbox(label="Paste text manually", lines=10)
        with gr.Row():
            typed_question = gr.Textbox(label="Type your question")
            # NOTE(review): `source=` is the Gradio 3.x parameter name;
            # Gradio 4+ renamed it to `sources=[...]`. requirements.txt does
            # not pin a version — confirm which Gradio this deploys against.
            audio_input = gr.Audio(source="microphone",type="filepath", label="Or speak your question")
        qa_btn = gr.Button("Get Answer")
        qa_output = gr.Textbox(label="Answer and Confidence", lines=3)

        def handle_qa(file, text, typed, audio):
            # Resolve the document context: an uploaded file takes
            # precedence over pasted text.
            context = ""
            if file:
                context = extract_text(file)
            elif text:
                context = text
            else:
                return "❗ Please upload or paste content."

            # Resolve the question: typed text takes precedence over a
            # recorded audio clip (which is transcribed with Whisper).
            if typed:
                question = typed
            elif audio:
                question = transcribe(audio)
            else:
                return "❗ Please speak or type a question."

            return ask_question(question, context)

        qa_btn.click(handle_qa, inputs=[uploaded_file, pasted_text, typed_question, audio_input], outputs=qa_output)

    with gr.Tab("Summarization"):
        with gr.Row():
            sum_file = gr.File(label="Upload .pdf / .docx / .txt", file_types=[".pdf", ".docx", ".txt"])
            sum_text = gr.Textbox(label="Or paste content", lines=10)
        sum_btn = gr.Button("Summarize")
        sum_output = gr.Textbox(label="Summary", lines=4)

        def handle_summary(file, text):
            # Same input precedence as handle_qa: file upload over pasted text.
            if file:
                context = extract_text(file)
            elif text:
                context = text
            else:
                return "❗ Please upload or paste content to summarize."
            return summarize_text(context)

        sum_btn.click(handle_summary, inputs=[sum_file, sum_text], outputs=sum_output)

demo.launch()
lexpilot_adavnced.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
lexpilot_embeddings_tensor.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44736d79307d6ac511c136c62cad24901b60b0c622780e603fe2b2404737e184
3
+ size 9438523
lexpilot_trained_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebe0ab544c7642ed9bda903288c02940d2da063d20983c64c8636c5b31a26369
3
+ size 435656713
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ PyPDF2
4
+ python-docx
5
+ gradio
6
+ git+https://github.com/openai/whisper.git