# Source code for wordle.frequency

#!/usr/bin/env python
#
#  frequency.py
"""
Functions to determine word token frequency for wordclouds.

.. versionadded:: 0.2.0
"""
#
#  Copyright (c) 2020 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in all
#  copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
#  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
#  OR OTHER DEALINGS IN THE SOFTWARE.
#

# stdlib
import pathlib
import re
import typing
from collections import Counter
from string import punctuation
from typing import Optional, Sequence

# 3rd party
import pygments.lexers  # type: ignore
import pygments.token  # type: ignore
import pygments.util  # type: ignore
from domdf_python_tools.paths import PathPlus
from domdf_python_tools.typing import PathLike

# this package
from wordle.utils import _TemporaryDirectory, clone_into_tmpdir

# Explicit public API of this module.
__all__ = ["frequency_from_directory", "frequency_from_file", "frequency_from_git", "get_tokens"]


def get_tokens(filename: PathLike) -> typing.Counter[str]:
    """
    Returns a :class:`collections.Counter` of the tokens in a file.

    The file is tokenised with the Pygments lexer selected from its filename;
    comments, whitespace, operators and most punctuation are discarded so that
    only word-like tokens are counted.

    :param filename: The file to parse.

    :return: A count of words etc. in the file.
    """

    total: typing.Counter[str] = Counter()
    filename = PathPlus(filename)

    try:
        lexer = pygments.lexers.get_lexer_for_filename(filename)
    except pygments.util.ClassNotFound:
        # No lexer is registered for this file type; return an empty count.
        return total

    for token_type, text in lexer.get_tokens(filename.read_text()):
        if token_type in pygments.token.Comment:
            continue

        if token_type in pygments.token.Text:
            # Skip whitespace-only text tokens (newlines, spaces, tab runs).
            if re.match(r"^\s*$", text):
                continue

        if token_type in pygments.token.String and text == '"':
            continue

        if token_type in pygments.token.String.Escape:
            # Escape sequences carry no countable words.
            # (The original guard ``re.match(r"\\*", text)`` matched every
            # string, so all escape tokens were already skipped.)
            continue

        if token_type in pygments.token.String.Double:
            if text == '\n' or re.match(r'^"*$', text):
                continue

        if token_type in pygments.token.String.Single:
            if text == '\n' or re.match(r"^'*$", text):
                continue

        if token_type in pygments.token.Punctuation and text in "[],{}:();":
            continue

        if token_type in pygments.token.Operator:
            continue

        if token_type in pygments.token.String.Affix:
            continue

        if token_type in pygments.token.String.Interpol and text in "{}":
            continue

        if re.match("^:*$", text):
            # Empty tokens and runs of colons carry no useful words.
            continue

        total += Counter(re.split("[ \n\t]", text))

    # Remove tokens consisting solely of punctuation, plus the empty/space
    # entries introduced by the split above.
    punctuation_only = re.compile(f"^[{punctuation}]+$")
    to_delete = ['', ' ']

    for word in total:
        if punctuation_only.match(word):
            to_delete.append(word)

    for word in to_delete:
        # Counter.__delitem__ ignores missing keys, so this is safe even if
        # '' or ' ' never occurred.
        del total[word]

    # Merge words that differ only by trailing colons (e.g. "spam:" / "spam").
    all_words: typing.Counter[str] = Counter()

    for word, count in total.items():
        # Accumulate with ``+=``: previously an ``=`` assignment meant that
        # when both "word" and "word:" were present, one count silently
        # overwrote the other.
        all_words[word.rstrip(':')] += count

    return all_words
def frequency_from_file(
        filename: PathLike,
        exclude_words: Sequence[str] = (),
        ) -> Counter:
    """
    Returns a dictionary mapping the words in the file to their frequencies.

    :param filename: The file to process
    :param exclude_words: An optional list of words to exclude

    .. versionadded:: 0.2.0

    .. seealso:: :func:`~.get_tokens`
    """

    counts = get_tokens(filename)

    # Drop any words the caller asked to exclude; missing keys are ignored.
    for unwanted in exclude_words:
        counts.pop(unwanted, None)

    return counts
def frequency_from_directory(
        directory: PathLike,
        exclude_words: Sequence[str] = (),
        exclude_dirs: Sequence[PathLike] = (),
        ) -> Counter:
    """
    Returns a dictionary mapping the words in files in ``directory`` to their frequencies.

    :param directory: The directory to process
    :param exclude_words: An optional list of words to exclude
    :param exclude_dirs: An optional list of directories to exclude.
        Entries may be absolute (under ``directory``) or relative to it.

    .. versionadded:: 0.2.0
    """

    # TODO: only certain file extensions
    directory = pathlib.Path(directory).absolute()

    # Directories to skip, as POSIX-style paths relative to ``directory``.
    # ``.git`` is always excluded.
    exclude_dirs_list = [".git"]

    for d in exclude_dirs:
        d = pathlib.Path(d)
        if d.is_absolute():
            d = d.relative_to(directory)
        exclude_dirs_list.append(d.as_posix())

    def is_excluded(path: pathlib.Path) -> bool:
        # Compare literal path prefixes on a directory boundary.
        # Previously the directory names were used as unescaped regular
        # expressions via ``re.match``, so ``.git`` matched any character
        # followed by "git" and, being unanchored at the end, also
        # excluded e.g. ``.github``.
        rel = path.relative_to(directory).as_posix()
        for dir_name in exclude_dirs_list:
            if rel == dir_name or rel.startswith(f"{dir_name}/"):
                return True
        return False

    word_counts: typing.Counter[str] = Counter()

    # ``rglob`` already prepends ``**/``, so "*.*" matches files (with an
    # extension) at any depth — the original "**/*.*" was redundant.
    for file in directory.rglob("*.*"):
        if file.is_file() and not is_excluded(file):
            word_counts += get_tokens(file)

    for word in exclude_words:
        if word in word_counts:
            del word_counts[word]

    return word_counts
def frequency_from_git(
        git_url: str,
        sha: Optional[str] = None,
        depth: Optional[int] = None,
        exclude_words: Sequence[str] = (),
        exclude_dirs: Sequence[PathLike] = (),
        ) -> Counter:
    """
    Returns a dictionary mapping the words in files in ``directory`` to their frequencies.

    :param git_url: The url of the git repository to process
    :param sha: An optional SHA hash of a commit to checkout.
    :param depth: An optional depth to clone at.
        If :py:obj:`None` and ``sha`` is :py:obj:`None` the depth is ``1``.
        If :py:obj:`None` and ``sha`` is given the depth is unlimited.
    :param exclude_words: An optional list of words to exclude.
    :param exclude_dirs: An optional list of directories to exclude.

    .. versionadded:: 0.2.0
    """

    # Clone into a temporary checkout, count its words, then let the
    # context manager remove the checkout again.
    with _TemporaryDirectory() as tmpdir:
        clone_into_tmpdir(git_url, tmpdir, sha=sha, depth=depth)

        return frequency_from_directory(
                tmpdir,
                exclude_words=exclude_words,
                exclude_dirs=exclude_dirs,
                )