aarushi-211 committed on
Commit
50b3bb3
·
1 Parent(s): 29a144e

Initial commit of TOS summarizer

Browse files
Files changed (3) hide show
  1. README.md +6 -13
  2. app2.py +49 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -1,14 +1,7 @@
1
- ---
2
- title: TOS Summarizer
3
- emoji: 👁
4
- colorFrom: purple
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.38.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Summarize lengthy Terms of Service for better readability
12
- ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
1
+ # TOS-Summarization
2
+ Uses extractive and abstractive summarization to summarize Terms of Service documents. <br>
 
 
 
 
 
 
 
 
 
 
3
 
4
+ The Extractive methods used are: KeyPhrase Extraction and TextRank. <br>
5
+ The Abstractive methods used are: BART transformer and PEGASUS transformer fine-tuned on the datasets used. <br>
6
+
7
+ Model Space on Hugging Face: https://huggingface.co/spaces/Arjav/TOS-Summarization
app2.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import LEDForConditionalGeneration, LEDTokenizer
3
+ import torch
4
+ import re
5
+
6
# Load the LED model and its tokenizer a single time at import, so that
# every call to the summarizer reuses the same objects.
model_name = "aarushi-211/TOS-Longformer"
tokenizer = LEDTokenizer.from_pretrained(model_name)
model = LEDForConditionalGeneration.from_pretrained(model_name)
10
+
11
def summarize_in_points(Terms):
    """Summarize a Terms of Service text and return it as bullet points.

    The text is tokenized (truncated to at most 4096 tokens), summarized
    with beam search by the module-level ``model``, decoded with the
    module-level ``tokenizer``, split into sentences, and each sentence is
    prefixed with a bullet character.

    Args:
        Terms: Raw Terms of Service text. ``None``, an empty string, or a
            whitespace-only string yields ``""`` without running the model.

    Returns:
        A string with one ``"• <sentence>"`` per line, or ``""`` when there
        is nothing to summarize.
    """
    # Nothing to summarize: skip the expensive 9-beam generation rather
    # than letting the model produce text from an empty prompt.
    if not Terms or not Terms.strip():
        return ""

    # Tokenize input; anything past 4096 tokens is dropped.
    input_tokenized = tokenizer.encode(
        Terms, return_tensors='pt', max_length=4096, truncation=True)

    # Generate the summary token ids with beam search.
    summary_ids = model.generate(
        input_tokenized,
        num_beams=9,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        min_length=50,
        max_length=150,
        early_stopping=True,
    )

    # Only the first returned sequence is decoded.
    summary = tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # Split after sentence-ending punctuation followed by whitespace; the
    # look-behind keeps the punctuation attached to its sentence.
    sentences = re.split(r'(?<=[.?!])\s+', summary.strip())

    # Drop empty fragments and format the rest as bullet points.
    return "\n".join(f"• {sentence}" for sentence in sentences if sentence)
32
+
33
# Gradio UI: a text box for the Terms of Service, a text box for the
# bulleted summary, and one sample excerpt offered as an example.
title = "Terms of Service Summarization"
description = "Enter a Terms of Service document to summarize"

_example_terms = "account termination policy youtube will terminate a user s access to the service if under appropriate circumstances the user is determined to be a repeat infringer. youtube reserves the right to decide whether content violates these terms of service for reasons other than copyright infringement such as but not limited to pornography obscenity or excessive length. youtube may at any time without prior notice and in its sole discretion remove such content and or terminate a user s account for submitting such material in violation of these terms of service."

_terms_box = gr.Textbox(
    label="Terms of Service",
    lines=10,
    placeholder="Paste TOS text here...",
)
_summary_box = gr.Textbox(label="Summary in Bullet Points", lines=10)

interface = gr.Interface(
    fn=summarize_in_points,
    inputs=_terms_box,
    outputs=_summary_box,
    description=description,
    title=title,
    examples=[[_example_terms]],
    allow_flagging='never',
)

# Start the web app.
interface.launch()
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ streamlit