Source code for wordle

#!/usr/bin/env python
#
#  __init__.py
"""
Create wordclouds from git repositories, directories and source files.
"""
#
#  Copyright (c) 2020-2021 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in all
#  copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
#  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
#  OR OTHER DEALINGS IN THE SOFTWARE.
#
#  Based on "wordcloud" by Andreas Christian Mueller and Paul Nechifor.
#  Copyright (c) 2012
#  MIT Licensed
#

# stdlib
import os
import pathlib
import sys
import time
import typing
from typing import Callable, Optional, Sequence, Union

# 3rd party
import numpy
from domdf_python_tools.typing import PathLike
from matplotlib.colors import Colormap  # type: ignore
from numpy.random.mtrand import RandomState
from wordcloud import WordCloud  # type: ignore

# this package
from wordle.frequency import frequency_from_directory, frequency_from_file, get_tokens
from wordle.utils import _TemporaryDirectory, clone_into_tmpdir

__author__: str = "Dominic Davis-Foster"
__copyright__: str = "2020 Dominic Davis-Foster"
__license__: str = "MIT License"
__version__: str = "0.2.1"
__email__: str = "dominic@davis-foster.co.uk"

__all__ = ["Wordle", "export_wordcloud", "get_tokens"]


[docs]class Wordle(WordCloud): r""" Generate word clouds from source code. :param font_path: Font path to the font that will be used (OTF or TTF). Defaults to DroidSansMono path on a Linux machine. If you are on another OS or don't have this font, you need to adjust this path. :param width: The width of the canvas. :param height: The height of the canvas. :param prefer_horizontal: The ratio of times to try horizontal fitting as opposed to vertical. If prefer_horizontal < 1, the algorithm will try rotating the word if it doesn't fit. (There is currently no built-in way to get only vertical words.) :param mask: If not :py:obj:`None`, gives a binary mask on where to draw words. If mask is not :py:obj:`None`, width and height will be ignored and the shape of mask will be used instead. All white (``#FF`` or ``#FFFFFF``) entries will be considerd "masked out" while other entries will be free to draw on. :param contour_width: If mask is not :py:obj:`None` and contour_width > 0, draw the mask contour. :param contour_color: Mask contour color. :param scale: Scaling between computation and drawing. For large word-cloud images, using scale instead of larger canvas size is significantly faster, but might lead to a coarser fit for the words. :param min_font_size: Smallest font size to use. Will stop when there is no more room in this size. :param font_step: Step size for the font. ``font_step`` > 1 might speed up computation but give a worse fit. :param max_words: The maximum number of words. :param background_color: Background color for the word cloud image. :param max_font_size: Maximum font size for the largest word. If :py:obj:`None` the height of the image is used. :param mode: Transparent background will be generated when mode is "RGBA" and background_color is None. :param relative_scaling: Importance of relative word frequencies for font-size. With relative_scaling=0, only word-ranks are considered. With relative_scaling=1, a word that is twice as frequent will have twice the size. If you want to consider the word frequencies and not only their rank, relative_scaling around .5 often looks good. If 'auto' it will be set to 0.5 unless repeat is true, in which case it will be set to 0. :param color_func: Callable with parameters ``word``, ``font_size``, ``position``, ``orientation``, ``font_path``, ``random_state`` which returns a PIL color for each word. Overwrites "colormap". See ``colormap`` for specifying a matplotlib colormap instead. To create a word cloud with a single color, use ``color_func=lambda *args, **kwargs: "white"``. The single color can also be specified using RGB code. For example ``color_func=lambda *args, **kwargs: (255,0,0)`` sets the color to red. :param regexp: Regular expression to split the input text into tokens in process_text. If None is specified, ``r"\w[\w']+"`` is used. Ignored if using generate_from_frequencies. :param collocations: Whether to include collocations (bigrams) of two words. Ignored if using generate_from_frequencies. :param colormap: Matplotlib colormap to randomly draw colors from for each word. Ignored if "color_func" is specified. Default "viridis". :no-default colormap: :param repeat: Whether to repeat words and phrases until max_words or min_font_size is reached. :param include_numbers: Whether to include numbers as phrases or not. :param min_word_length: Minimum number of letters a word must have to be included. :param random_state: Seed for the randomness that determines the colour and position of words. .. note:: Larger canvases with make the code significantly slower. If you need a large word cloud, try a lower canvas size, and set the scale parameter. The algorithm might give more weight to the ranking of the words than their actual frequencies, depending on the ``max_font_size`` and the scaling heuristic. """ color_func: Callable """ Callable with parameters ``word``, ``font_size``, ``position``, ``orientation``, ``font_path``, ``random_state`` which returns a PIL color for each word. """ def to_html(self): # noqa: D102 raise NotImplementedError def __init__( self, font_path: Optional[str] = None, width: int = 400, # 1920 height: int = 200, # 1080 prefer_horizontal: float = 0.90, mask: Optional[numpy.ndarray] = None, contour_width: float = 0, contour_color: str = "black", scale: float = 1, min_font_size: int = 4, font_step: int = 1, max_words: int = 200, background_color: str = "black", max_font_size: Optional[int] = None, mode: str = "RGB", relative_scaling: Union[str, float] = "auto", color_func: Optional[Callable] = None, regexp: Optional[str] = None, collocations: bool = True, colormap: Union[None, str, Colormap] = None, repeat: bool = False, include_numbers: bool = False, min_word_length: int = 0, # margin=2, # ranks_only=None, random_state: Union[RandomState, int, None] = None, ) -> None: super().__init__( font_path=font_path, width=width, height=height, prefer_horizontal=prefer_horizontal, mask=mask, contour_width=contour_width, contour_color=contour_color, scale=scale, min_font_size=min_font_size, font_step=font_step, max_words=max_words, background_color=background_color, max_font_size=max_font_size, mode=mode, relative_scaling=relative_scaling, color_func=color_func, regexp=regexp, collocations=collocations, colormap=colormap, repeat=repeat, include_numbers=include_numbers, min_word_length=min_word_length, # margin=margin, # ranks_only=ranks_only, random_state=random_state, )
[docs] def __array__(self) -> numpy.ndarray: # pragma: no cover (typed wrapper) """ Returns the wordcloud image as numpy array. """ return super().__array__()
[docs] def generate_from_file( self, filename: PathLike, outfile: Optional[PathLike] = None, *, exclude_words: Sequence[str] = (), max_font_size: Optional[int] = None ) -> "Wordle": """ Create a word_cloud from a source code file. :param filename: The file to process :param outfile: The file to save the wordle as. Supported formats are ``PNG``, ``JPEG`` and ``SVG``. If :py:obj:`None` the wordle is not saved :param exclude_words: An optional list of words to exclude :param max_font_size: Use this font-size instead of :attr:`~Wordle.max_font_size`. .. versionchanged:: 0.2.1 ``exclude_words``, ``max_font_size`` are now keyword-only. """ word_counts = frequency_from_file(filename, exclude_words) self.generate_from_frequencies(word_counts, max_font_size=max_font_size) if outfile: export_wordcloud(self, outfile) return self
[docs] def generate_from_directory( self, directory: PathLike, outfile: Optional[PathLike] = None, *, exclude_words: Sequence[str] = (), exclude_dirs: Sequence[PathLike] = (), max_font_size: Optional[int] = None ) -> "Wordle": """ Create a word_cloud from a directory of source code files. :param directory: The directory to process :param outfile: The file to save the wordle as. Supported formats are ``PNG``, ``JPEG`` and SVG. If :py:obj:`None` the wordle is not saved. :param exclude_words: An optional list of words to exclude :param exclude_dirs: An optional list of directories to exclude. Each entry is treated as a regular expression to match at the beginning of the relative path. :param max_font_size: Use this font-size instead of :attr:`~Wordle.max_font_size`. .. versionchanged:: 0.2.1 ``exclude_words``, ``exclude_dirs``, ``max_font_size`` are now keyword-only. """ word_counts: typing.Counter[str] = frequency_from_directory( directory, exclude_words=exclude_words, exclude_dirs=exclude_dirs, ) self.generate_from_frequencies(word_counts, max_font_size=max_font_size) if outfile is not None: export_wordcloud(self, outfile) # with open("wordcount.json", "w") as fp: # json.dump(word_counts, fp) return self
[docs] def generate_from_git( self, git_url: str, outfile: Optional[PathLike] = None, *, sha: Optional[str] = None, depth: Optional[int] = None, exclude_words: Sequence[str] = (), exclude_dirs: Sequence[PathLike] = (), max_font_size: Optional[int] = None ) -> "Wordle": """ Create a word_cloud from a directory of source code files. :param git_url: The url of the git repository to process :param outfile: The file to save the wordle as. Supported formats are ``PNG``, ``JPEG`` and SVG. If :py:obj:`None` the wordle is not saved :param sha: An optional SHA hash of a commit to checkout. :param depth: An optional depth to clone at. If :py:obj:`None` and ``sha`` is :py:obj:`None` the depth is ``1``. If :py:obj:`None` and ``sha`` is given the depth is unlimited. :param exclude_words: An optional list of words to exclude. :param exclude_dirs: An optional list of directories to exclude. :param max_font_size: Use this font-size instead of self.max_font_size. .. versionchanged:: 0.2.1 * ``exclude_words``, ``exclude_dirs``, ``max_font_size`` are now keyword-only. * Added the ``sha`` and ``depth`` keyword-only arguments. """ with _TemporaryDirectory() as tmpdir: clone_into_tmpdir(git_url, tmpdir, sha=sha, depth=depth) self.generate_from_directory( tmpdir, outfile=outfile, exclude_dirs=exclude_dirs, exclude_words=exclude_words, max_font_size=max_font_size, ) if sys.platform == "win32": time.sleep(5) # pragma: no cover (!Windows) return self
[docs] def recolor( # pragma: no cover (typed wrapper) self, random_state: Union[RandomState, int, None] = None, color_func: Optional[Callable] = None, colormap: Union[None, str, Colormap] = None, ) -> "Wordle": """ Recolour the existing layout. Applying a new coloring is much faster than regenerating the whole wordle. :param random_state: If not :py:obj:`None`, a fixed random state is used. If an :class:`int` is given, this is used as seed for a :class:`random.Random` state. :param color_func: Function to generate new color from word count, font size, position and orientation. If :py:obj:`None`, :attr:`~Wordle.color_func` is used. :param colormap: Use this colormap to generate new colors. Ignored if ``color_func`` is specified. If :py:obj:`None`, :attr:`~Wordle.color_func` or :attr:`~Wordle.color_map` is used. :returns: self """ return super().recolor(random_state, color_func, colormap)
[docs] def to_array(self): # pragma: no cover (typed wrapper) """ Returns the wordcloud image as numpy array. """ return super().to_array()
[docs] def to_file(self, filename: PathLike): """ Export the wordle to a file. :param filename: The file to save as. :returns: self """ return super().to_file(os.fspath(filename))
[docs] def to_image(self): """ Returns the wordcloud as an image. """ return super().to_image()
[docs] def to_svg( self, *, embed_font: bool = False, optimize_embedded_font: bool = True, embed_image: bool = False, ) -> str: """ Export the wordle to an SVG. :param embed_font: Whether to include font inside resulting SVG file. :param optimize_embedded_font: Whether to be aggressive when embedding a font, to reduce size. In particular, hinting tables are dropped, which may introduce slight changes to character shapes (w.r.t. `to_image` baseline). :param embed_image: Whether to include rasterized image inside resulting SVG file. Useful for debugging. :returns: The content of the SVG image. """ return super().to_svg(embed_font, optimize_embedded_font, embed_image)
[docs]def export_wordcloud(word_cloud: WordCloud, outfile: PathLike) -> None: """ Export a wordcloud to a file. :param word_cloud: :param outfile: The file to export the wordcloud to. """ outfile = pathlib.Path(outfile) if outfile.suffix == ".svg": outfile.write_text(word_cloud.to_svg()) else: word_cloud.to_file(str(outfile))