#! /usr/bin/env python3
# Based on code from https://stackabuse.com/text-summarization-with-nltk-in-python/
"""
A simple & fast tool for synopsizing documents, for those who morally object to LLMs.
It can be used as a Python module, terminal command, or web UI.

Install its dependencies by running:

    pip install nltk flask textxtract[all]

Run this script (after installing dependencies) to initiate a debug-mode webserver.
Run this script with filenames to synopsize those files.
Precede a commandline argument with - to set how many sentences of output are generated.
Precede a commandline argument with + to set the language for better results.

See https://flask.palletsprojects.com/en/stable/deploying/ for deployment instructions.
Please deploy this yourself to share with anyone you know might be interested.
"""
import heapq
import sys
from collections import Counter

import nltk
from flask import Flask, request, Response
from textxtract import SyncTextExtractor

NAME = "this synopsization service"


# ---------------------------------------------------------------------------
# Core module
# ---------------------------------------------------------------------------

# Fetch the tokenizer model and stop-word lists when the module is imported.
# NOTE(review): recent NLTK releases appear to look for the 'punkt_tab'
# resource rather than 'punkt' -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')


def synopsize(article_text, lang='english', out_len=7, max_sentence_words=30):
    """Return an extractive synopsis of *article_text*.

    Every word is weighted by how often it occurs in the text (relative to
    the most frequent word); every sentence is scored by the sum of the
    weights of its words; the *out_len* best-scoring sentences are returned,
    best first, joined by single spaces.

    Args:
        article_text: The text to synopsize.
        lang: Name of the NLTK stop-word list to ignore while counting
            words. A falsy value disables stop-word filtering.
        out_len: Maximum number of sentences in the result.
        max_sentence_words: Sentences with this many or more
            space-separated words are never selected.

    Returns:
        The selected sentences joined by spaces, or ``''`` when the text
        contains no countable words.
    """
    stop_words = set(nltk.corpus.stopwords.words(lang)) if lang else set()

    # Tokenize each sentence once, lowercased, and reuse those tokens for
    # both counting and scoring so that lookups always agree with the counts
    # (and with the lowercase stop-word lists).
    sentences = nltk.sent_tokenize(article_text)
    sentence_tokens = [nltk.word_tokenize(sent.lower()) for sent in sentences]

    counts = Counter(
        token
        for tokens in sentence_tokens
        for token in tokens
        # Pure punctuation is not a word; counting it would let "." and ","
        # dominate the normalization below.
        if token not in stop_words and any(ch.isalnum() for ch in token)
    )
    if not counts:
        # Empty input, or nothing but stop words/punctuation.
        return ''

    maximum_frequency = max(counts.values())
    weights = {word: count / maximum_frequency for word, count in counts.items()}

    sentence_scores = {}
    for sent, tokens in zip(sentences, sentence_tokens):
        if len(sent.split(' ')) >= max_sentence_words:
            continue
        matched = [weights[token] for token in tokens if token in weights]
        if matched:
            # Identical sentences share one entry and pool their scores.
            sentence_scores[sent] = sentence_scores.get(sent, 0) + sum(matched)

    summary_sentences = heapq.nlargest(out_len, sentence_scores,
                                       key=sentence_scores.get)
    return ' '.join(summary_sentences)


# ---------------------------------------------------------------------------
# Web interface!
# ---------------------------------------------------------------------------

app = webapp = Flask(__name__)


@app.get('/')
def home():
    """Serve the landing page, with ``??`` replaced by the service NAME."""
    # NOTE(review): web_synopsize() reads the form fields 'text', 'docs',
    # 'lang', 'length' and 'per-doc'; no markup providing them is visible in
    # this literal -- confirm the page text is complete.
    return """
Confidently synopsize text, guilt free!
While no software (not even modern LLMs) can match the quality of a talented human at summarization, ?? is at least guaranteed to be honest and short!
Its development and use incurs no electricity cost beyond what would've been used anyways, probably less. Its computational efficiency is approaching the theoretical limit.
Made with ♡ by Adrian Cochrane aided by NLTK, TextXTract, & Stack Abuse.
""".replace('??', NAME)


def _uploaded_docs():
    """Return the uploaded 'docs' files that actually carry a filename."""
    # An empty file input may still be submitted as a part with a blank
    # filename; such parts are not documents.
    return [doc for doc in request.files.getlist('docs')
            if doc.filename and doc.filename.strip()]


def _bad_request(message):
    """Build a plain-text 400 response."""
    return Response(message, status=400, mimetype='text/plain')


@webapp.post('/')
def web_synopsize():
    """Synopsize the posted text and/or uploaded documents.

    Form fields read: 'text' (pasted text), 'docs' (uploaded files), 'lang'
    (stop-word language), 'length' (number of sentences) and 'per-doc' (when
    set, each uploaded file gets its own synopsis and 'text' is ignored).

    Returns a text/plain response with the synopsis, or a 400 response when
    'lang' or 'length' is invalid.
    """
    lang = request.form.get('lang', 'english')
    # 'lang' comes from the client and is handed to the stop-word corpus
    # reader as a file id, so only accept names the corpus actually lists.
    if lang and lang not in nltk.corpus.stopwords.fileids():
        return _bad_request("Unsupported language.")
    try:
        out_len = int(request.form.get('length', 7))
    except ValueError:
        return _bad_request("Length must be a whole number.")

    docs = _uploaded_docs()

    if request.form.get('per-doc'):
        if not docs:
            return "What did you want to synopsize again?"
        extractor = SyncTextExtractor()
        res = []
        for doc in docs:
            synopsis = synopsize(extractor.extract(doc.read(), doc.filename),
                                 lang, out_len)
            res.append(doc.filename + ":\n" + synopsis)
        resp = "\n\n".join(res)
    else:
        text = request.form.get('text', '')
        if text.strip() == "" and not docs:
            return "What did you want to synopsize again?"
        extractor = SyncTextExtractor()
        for doc in docs:
            text += "\n\n" + extractor.extract(doc.read(), doc.filename)
        resp = synopsize(text, lang, out_len)

    return Response(resp, mimetype='text/plain')


@webapp.get('/source')
def viewsource():
    """Serve this script's own source code as plain text."""
    # The file contains non-ASCII characters, so don't rely on the locale's
    # default encoding.
    with open(__file__, encoding='utf-8') as f:
        return Response(f.read(), mimetype='text/plain')


# ---------------------------------------------------------------------------
# Commandline interface
# ---------------------------------------------------------------------------

def main(args=None):
    """Run the debug webserver (no arguments) or synopsize the given files.

    Arguments starting with ``-`` set the number of output sentences,
    arguments starting with ``+`` set the language, and everything else is
    treated as a file to extract text from. Options apply to the whole run
    regardless of where they appear.
    """
    args = sys.argv[1:] if args is None else args
    if not args:
        webapp.run(debug=True)
        return

    article = []
    out_len = 7
    lang = "english"
    extractor = SyncTextExtractor()
    for arg in args:
        if arg.startswith('-'):
            try:
                out_len = int(arg[1:])
            except ValueError:
                sys.exit("Expected a number of sentences after '-', got: " + arg)
        elif arg.startswith('+'):
            lang = arg[1:]
        else:
            article.append(extractor.extract(arg))

    if not article:
        sys.exit("No files were given to synopsize.")

    article_text = "\n\n".join(article)
    print(synopsize(article_text, lang, out_len))


if __name__ == '__main__':
    main()