In [1]:
import os

import pandas as pd
import requests

In [2]:
pip install -U gensim

Note: you may need to restart the kernel to use updated packages.


In [105]:
# Read the Financial Modeling Prep API key from the environment so the secret
# is never stored in the notebook. The key that used to be hardcoded here has
# been published with the file and should be revoked.
fm_key = os.environ['FMP_API_KEY']

ticker = 'AAPL'

# Only articles from these publishers are kept for the text analysis below.
NEWS_SITES = ['24/7 Wall Street', 'CNBC', 'Forbes', 'Barrons', 'Market Watch']

# Let requests build and escape the query string; the timeout keeps a stalled
# connection from hanging the kernel.
response = requests.get(
    'https://financialmodelingprep.com/api/v3/stock_news',
    params={'tickers': ticker, 'limit': 100, 'apikey': fm_key},
    timeout=30,
)
response.raise_for_status()  # surface HTTP errors instead of parsing an error body

stock_news_all = pd.DataFrame(response.json())
stock_news = stock_news_all[stock_news_all['site'].isin(NEWS_SITES)]

# One long string holding the text of every retained article.
text_str = ' '.join(str(x) for x in stock_news['text'])

# print(text_str)

### 1. Summarization

In [106]:
import gensim
# NOTE(review): `gensim.summarization` is not shipped with gensim 4.x -
# confirm the installed gensim is a 3.x release before running this cell.
from gensim.summarization import summarize

# Summarize the concatenated article texts and print the result.
text_summ = summarize(text_str)
print(text_summ)

Apple Inc. plans to focus on camera upgrades, particularly around video, with its next iPhone launch, according to a report.
Apple is reportedly planning a video Portrait mode with several new photo and video updates on the new iPhone, Bloomberg reported Tuesday.
During Apple's Q3 earnings call last week, CEO Tim Cook cautioned that sales of products including the iPhone and iPad could be impacted by the semiconductor shortage.


### 2. Sentence extraction ranked by importance (TextRank)
- Reference: https://medium.com/analytics-vidhya/sentence-extraction-using-textrank-algorithm-7f5c8fd568cd

In [107]:
import re
from pprint import pprint

import numpy as np
from nltk import sent_tokenize, word_tokenize

from nltk.cluster.util import cosine_distance

# Matches one or more consecutive whitespace characters.
MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)

def normalize_whitespace(text):
    """
    Collapse every run of whitespace in `text` into a single character.

    Each run is handed to `_replace_whitespace`, which picks the replacement:
    a single LF (Unix new line) when the run contains a line break, otherwise
    a single space.
    """
    collapsed = MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)
    return collapsed


def _replace_whitespace(match):
    text = match.group()

    if "\n" in text or "\r" in text:
        return "\n"
    else:
        return " "
    

def is_blank(string):
    """
    Tell whether `string` has no visible content.

    Returns `True` for an empty (or otherwise falsy) value and for a string
    made up solely of white-space characters; returns `False` otherwise.
    """
    if string:
        # Non-empty: blank only when every character is white-space.
        return string.isspace()
    return True


def get_symmetric_matrix(matrix):
    """
    Build a symmetric matrix from a square matrix.

    The matrix is added to its own transpose; the diagonal, which that sum
    counts twice, is then subtracted once.

    :param matrix: square numpy array
    :return: matrix + transpose - diagonal
    """
    mirrored_sum = matrix + matrix.T
    diagonal_only = np.diag(matrix.diagonal())
    return mirrored_sum - diagonal_only


def core_cosine_similarity(vector1, vector2):
    """
    Measure the cosine similarity between two vectors.

    :param vector1: first vector (sequence of numbers)
    :param vector2: second vector, same length as `vector1`
    :return: `1 - cosine_distance(vector1, vector2)`; for the non-negative
        count vectors used in this notebook that is a value in [0, 1].
        Returns 0.0 when either vector is empty or all zeros.
    """
    # A vector with no non-zero entry has zero norm, so the cosine would be a
    # division by zero. Treat it as "nothing in common" so one degenerate
    # sentence cannot poison the whole similarity matrix.
    if not np.any(vector1) or not np.any(vector2):
        return 0.0
    return 1 - cosine_distance(vector1, vector2)

class TextRank4Sentences():
    """
    Rank the sentences of a text with a PageRank-style iteration (TextRank).

    Call `analyze` with the raw text first, then `get_top_sentences` to read
    back the highest-scoring sentences.
    """

    def __init__(self):
        self.damping = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 100  # iteration steps
        self.text_str = None  # raw text given to analyze()
        self.sentences = None  # sentences obtained from sent_tokenize
        self.pr_vector = None  # one score per sentence, filled in by analyze()

    def _sentence_similarity(self, sent1, sent2, stopwords=None):
        """
        Cosine similarity between the word-count vectors of two sentences.

        :param sent1: first sentence as a list of tokens
        :param sent2: second sentence as a list of tokens
        :param stopwords: tokens to leave out of the counts (compared against
            the lower-cased tokens); defaults to none
        :return: result of `core_cosine_similarity` on the two count vectors
        """
        if stopwords is None:
            stopwords = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        # Map every distinct word to its vector position once, instead of
        # scanning a list with `.index()` for each token.
        word_index = {w: i for i, w in enumerate(set(sent1 + sent2))}

        vector1 = [0] * len(word_index)
        vector2 = [0] * len(word_index)

        # build the vector for the first sentence
        for w in sent1:
            if w not in stopwords:
                vector1[word_index[w]] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w not in stopwords:
                vector2[word_index[w]] += 1

        return core_cosine_similarity(vector1, vector2)

    def _build_similarity_matrix(self, sentences, stopwords=None):
        """
        Build the column-normalized sentence similarity matrix.

        :param sentences: list of tokenized sentences (lists of tokens)
        :param stopwords: forwarded to `_sentence_similarity`
        :return: square numpy array; each column with a non-zero sum is
            divided by that sum, columns whose sum is zero are all zeros
        """
        count = len(sentences)
        sm = np.zeros([count, count])

        # The similarity is symmetric, so score each pair once and mirror it;
        # the diagonal stays at zero.
        for idx1 in range(count):
            for idx2 in range(idx1 + 1, count):
                similarity = self._sentence_similarity(
                    sentences[idx1], sentences[idx2], stopwords=stopwords)
                sm[idx1, idx2] = similarity
                sm[idx2, idx1] = similarity

        # Normalize matrix by column. `out` must be supplied: with `where`,
        # positions that are skipped keep whatever `out` already holds, and a
        # freshly allocated output would contain arbitrary values there.
        norm = np.sum(sm, axis=0)
        sm_norm = np.divide(sm, norm, out=np.zeros_like(sm), where=norm != 0)

        return sm_norm

    def _run_page_rank(self, similarity_matrix):
        """
        Iterate the PageRank update on the similarity matrix.

        Starts from a vector of ones and stops after `self.steps` iterations,
        or earlier once the sum of the scores changes by less than
        `self.min_diff` between two iterations.

        :param similarity_matrix: square numpy array
        :return: numpy array with one score per row of the matrix
        """
        pr_vector = np.ones(len(similarity_matrix))

        previous_pr = 0
        for _ in range(self.steps):
            pr_vector = (1 - self.damping) + self.damping * np.matmul(similarity_matrix, pr_vector)
            current_pr = pr_vector.sum()
            if abs(previous_pr - current_pr) < self.min_diff:
                break
            previous_pr = current_pr

        return pr_vector

    def _get_sentence(self, index):
        """Return the sentence at `index`, or an empty string if out of range."""
        try:
            return self.sentences[index]
        except IndexError:
            return ""

    def get_top_sentences(self, number=5):
        """
        Return the highest-scoring sentences.

        :param number: maximum number of sentences to return; when the text
            has fewer sentences, all of them are returned
        :return: dict mapping each whitespace-normalized sentence to its
            score, inserted from highest to lowest score; empty if `analyze`
            has not been run yet
        """
        top_sentences = {}

        if self.pr_vector is None:
            return top_sentences

        # Sentence indices ordered from highest to lowest score.
        ranked = np.argsort(self.pr_vector)[::-1]

        # Slicing clamps to the number of sentences, so asking for more than
        # exist no longer raises IndexError; max() keeps a negative request
        # empty, as before.
        for idx in ranked[:max(number, 0)]:
            sent = normalize_whitespace(self.sentences[idx])
            top_sentences[sent] = self.pr_vector[idx]

        return top_sentences

    def analyze(self, text, stop_words=None):
        """
        Split `text` into sentences and compute a score for each one.

        The results are stored on the instance (`sentences`, `pr_vector`).

        :param text: raw text to analyze
        :param stop_words: tokens to ignore when comparing sentences
        """
        self.text_str = text
        self.sentences = sent_tokenize(self.text_str)

        tokenized_sentences = [word_tokenize(sent) for sent in self.sentences]

        similarity_matrix = self._build_similarity_matrix(tokenized_sentences, stop_words)

        self.pr_vector = self._run_page_rank(similarity_matrix)

# Score every sentence of the collected news text, then keep the five
# best-ranked sentences together with their scores.
tr4sh = TextRank4Sentences()
tr4sh.analyze(text=text_str)
top_sents = tr4sh.get_top_sentences(number=5)
top_sents

{"Apple's new system, which is in testing in the U.S. now, was also vociferously opposed by privacy advocates who warned it represents a slippery slope Xiaomi's sales grew 26% on-month, giving the company a 17.1% share of the total number of smartphones sold globally in June, Counterpoint Research said.": 1.2920086226548468,
 'Senators introduced a bipartisan bill Wednesday that seeks to rein in the control Google and Apple Inc. have over their respective mobile-app stores, part of a larger wave of antitrust legislation in congress.': 1.2585224012779395,
 'Affirm plans to offer the option to pay off iPhones, Macs, and iPads over an extended period in Canada, according to a Bloomberg report.': 1.2012919754962903,
 'British Columbia Investment Management raised stakes in Apple, Microsoft, and Shopify, and initiated a stake in marijuana stock Tilray in the second quarter.': 1.1932048305607537,
 'Apple Inc. plans to focus on camera upgrades, particularly around video, with its next iPhone 

### Plot:

In [167]:
# from io import BytesIO

# from matplotlib.figure import Figure
# import matplotlib.pyplot as plt
# from matplotlib.transforms import IdentityTransform

# def text_to_rgba(s, *, dpi, **kwargs):
#     # To convert a text string to an image, we can:
#     # - draw it on an empty and transparent figure;
#     # - save the figure to a temporary buffer using ``bbox_inches="tight",
#     #   pad_inches=0`` which will pick the correct area to save;
#     # - load the buffer using ``plt.imread``.
#     #
#     # (If desired, one can also directly save the image to the filesystem.)
#     fig = Figure(facecolor="none")
#     fig.text(0, 0, s, **kwargs)
#     buf = BytesIO()
#     fig.savefig(buf, dpi = dpi, format = "png", bbox_inches = "tight", pad_inches = 0)
#     buf.seek(0)
#     rgba = plt.imread(buf)
#     return rgba

# fig = plt.figure(figsize = (10,8))
# rgba_temp = text_to_rgba(r"what", color="blue", fontsize=20, dpi=200)
# fig.figimage(rgba_temp, 100, 50)
# fig.text(100, 350, r"some other string", color="red", fontsize=20, transform=IdentityTransform())
# plt.show()

In [158]:
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# wc = WordCloud(width = 6400, height = 6400,\
#                background_color = None, mode = "RGBA", min_font_size = 18).generate_from_frequencies(top_sents)
# plt.figure(figsize = (20, 20))
# plt.imshow(wc)
# plt.axis('off')
# plt.show()

In [157]:
# from PIL import Image, ImageDraw

# img = Image.new('RGB', (1600, 1600), color = (255, 255, 255))
# d = ImageDraw.Draw(img)

# t = 1
# for key in top_sents:
#     d.text((20*t, 20*t), str(key), fill=(255, 0, 0))
#     t += 1
    
# img.show()