mo-mazen committed on
Commit
9c2b411
·
verified ·
1 Parent(s): 70703b9

Upload insightX.py

Browse files
Files changed (1) hide show
  1. insightX.py +95 -0
insightX.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Run this app with:
2
+ # python -m streamlit run "d:/Code/project 1/insightX.py"
3
+ import streamlit as st
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
+ import torch
6
+ import pandas as pd
7
+ import docx2txt
8
+ import PyPDF2
9
+
10
+ # Load model and tokenizer
11
@st.cache_resource(show_spinner=False)
def load_model(model_name="google/long-t5-tglobal-base"):
    """Load the tokenizer and seq2seq model for *model_name*.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    are reused instead of being loaded again each time the script runs.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the tokenizer and the model. Defaults to the Long-T5
            checkpoint the app was written for.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # One name feeds both calls so the tokenizer and the model can never
    # drift apart to different checkpoints.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model
16
+
17
# Fetch the (cached) tokenizer/model pair used by the helpers below.
tokenizer, model = load_model()

# Start with an empty chat history the first time this session runs.
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Page header and a one-line description of the app.
st.title("🧠 InsightX Chat")
st.write(
    "Chat with Long-T5 to summarize, rewrite, or explore long-form text. "
    "You can also upload a file."
)

# Upper bound (in tokens) passed to generation for each summary.
max_output_length = st.slider(
    "Summary length (tokens)",
    min_value=128,
    max_value=1024,
    value=512,
)
28
+
29
+ # Chunking function
30
def chunk_text(text, chunk_size=16384):
    """Tokenize *text* and split the token ids into consecutive chunks.

    Args:
        text: The string to encode with the module-level ``tokenizer``.
        chunk_size: Maximum number of token ids per chunk.

    Returns:
        A list of token-id lists; every chunk except possibly the last has
        exactly ``chunk_size`` ids.
    """
    token_ids = tokenizer.encode(text)
    pieces = []
    for start in range(0, len(token_ids), chunk_size):
        pieces.append(token_ids[start:start + chunk_size])
    return pieces
33
+
34
+ # Summarization function
35
def summarize_long_text(text):
    """Summarize *text* chunk by chunk and join the partial summaries.

    The text is split into token chunks by ``chunk_text``. Each chunk is
    passed to the module-level ``model`` through ``generate`` (with
    ``max_length`` taken from the module-level ``max_output_length``) and
    the output is decoded with the module-level ``tokenizer``.

    Args:
        text: The text to summarize.

    Returns:
        The per-chunk summaries joined by blank lines, or an empty string
        when *text* is empty or contains only whitespace.
    """
    # Nothing to summarize: return early instead of running an expensive
    # generation pass on an input with no real content.
    if not text or not text.strip():
        return ""

    summaries = []
    # Generation never needs gradients, so one no_grad context covers the
    # whole loop.
    with torch.no_grad():
        for chunk in chunk_text(text):
            # generate() expects a batch dimension, hence the extra list.
            input_ids = torch.tensor([chunk])
            output_ids = model.generate(input_ids, max_length=max_output_length)
            summaries.append(
                tokenizer.decode(output_ids[0], skip_special_tokens=True)
            )
    return "\n\n".join(summaries)
45
+
46
# File uploader
uploaded_file = st.file_uploader(
    "Upload a file (PDF, Word, Excel, CSV)",
    type=["pdf", "docx", "xlsx", "csv"],
)
file_text = ""

if uploaded_file:
    # Streamlit re-executes this script on every widget interaction while the
    # uploaded file stays attached to the uploader. Without a cache the same
    # file would be re-extracted, re-summarized and re-appended to the chat
    # history on every rerun. The key also includes the requested summary
    # length so moving the slider still produces a fresh summary.
    upload_key = (
        getattr(uploaded_file, "file_id", None),
        uploaded_file.name,
        uploaded_file.size,
        max_output_length,
    )
    cached_upload = st.session_state.get("_upload_cache")
    if cached_upload is not None and cached_upload["key"] != upload_key:
        cached_upload = None

    if cached_upload is not None:
        file_text = cached_upload["text"]
    else:
        # Pick the extractor from the file extension.
        file_type = uploaded_file.name.split(".")[-1].lower()
        try:
            if file_type == "pdf":
                reader = PyPDF2.PdfReader(uploaded_file)
                # Extract each page once and drop pages that yield no text.
                page_texts = (page.extract_text() for page in reader.pages)
                file_text = "\n".join(text for text in page_texts if text)
            elif file_type == "docx":
                file_text = docx2txt.process(uploaded_file)
            elif file_type in ("xlsx", "csv"):
                read_table = pd.read_excel if file_type == "xlsx" else pd.read_csv
                file_text = read_table(uploaded_file).to_string(index=False)
        except Exception as e:
            # Top-level UI boundary: report the problem instead of crashing
            # the app; file_text stays empty so nothing is summarized.
            st.error(f"Error reading file: {e}")

    if file_text:
        if cached_upload is None:
            st.session_state.messages.append(
                {"role": "user", "content": f"(Uploaded file)\n{file_text}"}
            )
        with st.chat_message("user"):
            with st.expander("View Uploaded Text"):
                st.text_area("File Content", file_text, height=300)

        if cached_upload is None:
            output_text = summarize_long_text(file_text)
            st.session_state.messages.append(
                {"role": "assistant", "content": output_text}
            )
            st.session_state["_upload_cache"] = {
                "key": upload_key,
                "text": file_text,
                "summary": output_text,
            }
        else:
            # Same upload and settings as the previous run: reuse the summary.
            output_text = cached_upload["summary"]

        with st.chat_message("assistant"):
            st.markdown(output_text)
78
+
79
# Chat input: each submitted message is echoed, answered and recorded.
user_input = st.chat_input("Type your message or paste long text here...")

if user_input:
    # Record and show what the user typed.
    st.session_state.messages.append({"role": "user", "content": user_input})
    with st.chat_message("user"):
        st.markdown(user_input)

    # A bare "hello" gets a canned greeting; anything else is summarized.
    is_greeting = user_input.strip().lower() == "hello"
    if not is_greeting:
        output_text = summarize_long_text(user_input)
    else:
        output_text = "How can I help you?"

    # Record and show the reply.
    st.session_state.messages.append({"role": "assistant", "content": output_text})
    with st.chat_message("assistant"):
        st.markdown(output_text)