# SourceSpell - Command line spell checker for source code files.
#
# Copyright 2016 - Simon J Knibbs <simon.knibbs@gmail.com>

from __future__ import print_function

import os
import codecs
import sys
import argparse
import bisect
import re
import fnmatch
from collections import OrderedDict

import enchant
from enchant.tokenize import get_tokenizer, URLFilter, WikiWordFilter, Filter
import pygments
from pygments import lexers
from pygments.filters import TokenMergeFilter
from pygments.token import Comment, String, Token, Generic, Literal
from colorama import Fore, Back, Style, init

try:
    import magic
except ImportError:
    magic = None

if sys.platform == "win32":
    import msvcrt
    getchar = msvcrt.getwch  # getwch returns str; getch returns bytes on Python 3
else:  # POSIX platforms
    import tty
    import termios

    def getchar():
        """Gets a character from stdin without waiting for a newline.

        :returns: A single character from stdin.
        """
        fd = sys.stdin.fileno()
        old = termios.tcgetattr(fd)
        try:
            tty.setraw(fd)
            ch = sys.stdin.read(1)
        finally:
            termios.tcsetattr(fd, termios.TCSADRAIN, old)
        return ch


__version__ = '1.0'

DESCRIPTION = "SourceSpell - Command line spellchecker for source code files."


class EmptyFileError(Exception):
    """Error thrown for empty files."""
    pass


class ParseError(Exception):
    """Error thrown for Pygments lexer errors."""
    pass


class NextFile(Exception):
    """Trigger to advance to the next file."""
    pass


class HashBangFilter(Filter):
    """Filter skipping over the hashbang in executable scripts.

    Taken from: https://github.com/htgoebel/pysource-spellchecker
    """
    _pattern = re.compile(r"^#!/.+$")

    def _skip(self, word):
        if self._pattern.match(word):
            return True
        return False
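

# A small sketch of what HashBangFilter's pattern skips, assuming the
# tokeniser presents the hashbang line as a single chunk. Illustrative
# helper only; it is not called by the checker.
def _demo_hashbang_pattern():
    pattern = re.compile(r"^#!/.+$")
    assert pattern.match("#!/usr/bin/env python") is not None
    assert pattern.match("# an ordinary comment") is None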


class MyWikiWordFilter(WikiWordFilter):
    """Debugging variant of :class:`WikiWordFilter` that prints each word it
    inspects. Not used by the checkers below.
    """

    def _skip(self, text):
        print(text)
        # Propagate the parent's verdict; without this return every word
        # would be treated as not skipped.
        return WikiWordFilter._skip(self, text)


class EmailFilter(enchant.tokenize.EmailFilter):
    """Override the :class:`enchant.tokenize.EmailFilter` to filter out
    addresses enclosed in angle brackets, for example:

        <joe.bloggs@example.com>
    """
    _pattern = re.compile(r"^.+@[^\.].*\.[a-z]{2,}\W?$")
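

# A hedged sketch of the overridden email pattern: the trailing \W? lets a
# closing angle bracket ride along with the address. The addresses below are
# invented, and this helper is illustrative only.
def _demo_email_pattern():
    pattern = re.compile(r"^.+@[^\.].*\.[a-z]{2,}\W?$")
    assert pattern.match("<joe.bloggs@example.com>") is not None
    assert pattern.match("not-an-address") is None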


class SpellingCorrection(object):
    """Object to store information for a spelling error.

    :param filename: File path, relative to the base directory.
    :param word: The word being checked.
    :param index: The file index at the start of the word.
    :param line_no: The 1-indexed line number.
    :param column: The column index.
    :param dictionary: Reference to the dictionary object.
    :type dictionary: :class:`enchant.Dict`
    :param line_content: The contents of the line containing the error.
    """

    def __init__(self, filename, word, index, line_no, column, dictionary,
                 line_content):
        self.filename = filename
        self.word = word
        self.index = index
        self.line_no = line_no
        self.column = column
        self.dictionary = dictionary
        self.line_content = line_content.rstrip()

    def __str__(self):
        """Return a string representation of the error, including the
        filename, line and column numbers.
        """
        return (
            "%s - Ln %s Col %s: %s"
            % (self.filename, self.line_no, self.column, self.word)
        )

    @property
    def suggestions(self):
        """The :class:`list` of suggested corrections."""
        return self.dictionary.suggest(self.word)

    def prompt(self):
        """Generate a prompt listing the available corrections."""
        before = self.line_content[:self.column - 1]
        after = self.line_content[self.column - 1 + len(self.word):]
        suggestions = ' | '.join(
            ['%s: %s' % (idx, suggest)
             for idx, suggest in enumerate(self.suggestions[:10])]
        )
        return '%s%s%s%s%s\n\n%s' % (
            before, Back.RED, self.word, Style.RESET_ALL, after, suggestions
        )
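
    # A worked sketch of the slicing above: column is 1-indexed, so the
    # highlighted word starts at column - 1 in the line. The line and word
    # are invented; the helper is illustrative only.
    @staticmethod
    def _demo_prompt_slicing():
        line, word, column = "print('helo')", "helo", 8
        before = line[:column - 1]
        after = line[column - 1 + len(word):]
        assert (before, after) == ("print('", "')")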


def merge_tokens(stream):
    """Merge tokens of the same type from Pygments.

    Adapted from :class:`pygments.filters.TokenMergeFilter`
    """
    (curr_type, curr_value, curr_index) = (None, None, None)
    for index, ttype, value in stream:
        if ttype is curr_type:
            curr_value += value
        else:
            if curr_type is not None:
                yield (curr_index, curr_type, curr_value)
            (curr_type, curr_value, curr_index) = (ttype, value, index)
    if curr_type is not None:
        yield (curr_index, curr_type, curr_value)
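

# A usage sketch for merge_tokens: adjacent tokens of the same type are
# coalesced and keep the index of the first fragment. The stream below is
# hand-made rather than produced by a real lexer.
def _demo_merge_tokens():
    stream = [(0, Token.Text, 'Hello '), (6, Token.Text, 'world'),
              (11, Comment, '# done')]
    merged = list(merge_tokens(stream))
    assert merged == [(0, Token.Text, 'Hello world'), (11, Comment, '# done')]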


class SourceFile(object):
    """Interface for checking for spelling errors in a single source file.

    :param filename: Absolute path to the file.
    :param dictionary: Enchant dictionary.
    :type dictionary: :class:`enchant.Dict`
    :param tokeniser: Enchant tokeniser from :func:`get_tokenizer`
    :type tokeniser: :class:`enchant.tokenize.Tokenizer`
    :param base_dir: Base directory path.
    :param encoding: Character set encoding to read files with.
    """

    _rawstring_re = re.compile(r'^r["\']')

    def __init__(self, filename, dictionary, tokeniser, base_dir,
                 encoding='utf-8'):
        self.base_dir = base_dir
        self.filename = filename
        self.dict = dictionary
        # List of indexes of line endings for generating line numbers.
        self.line_idxs = []
        try:
            with codecs.open(self.filename, 'r', encoding) as src_file:
                self.content = src_file.read()
            line_lengths = [len(line) for line in self.content.splitlines(True)]
            if len(line_lengths) > 0:
                count = 0
                for length in line_lengths:
                    self.line_idxs.append(length + count)
                    count += length
            else:
                raise EmptyFileError("%s: File empty." % self.relname)
        except UnicodeDecodeError:
            print(
                "%s: Couldn't decode with '%s' codec." % (self.relname, encoding),
                file=sys.stderr
            )
            raise
        self.code_lexer = self._get_lexer()
        self.tokeniser = tokeniser

    def _get_lexer(self):
        """Initialise the Pygments lexer."""
        # TODO: Improve the lexer selection, since Jinja and other template
        # languages are often saved with a .html extension.
        lexer = None
        try:
            lexer = lexers.get_lexer_for_filename(self.filename)
        except pygments.util.ClassNotFound:
            pass
        if magic is not None and lexer is None:
            # Fall back to mimetype detection
            try:
                mimetype = magic.from_file(self.filename, mime=True)
                lexer = lexers.get_lexer_for_mimetype(mimetype)
            except pygments.util.ClassNotFound:
                pass
        if lexer is None:
            try:
                # If all else fails use the guess_lexer method
                lexer = lexers.guess_lexer(self.content[:512])
            except pygments.util.ClassNotFound:
                print("No lexer found for: %s" % self.relname, file=sys.stderr)
                raise
        return lexer

    @property
    def relname(self):
        """Returns the name of the file relative to the base directory
        being checked.
        """
        return self.filename[len(self.base_dir) + 1:]

    def _index_to_col_lineno(self, index):
        """Calculates the line and column index from the file index.

        :param index: The file index.
        :returns: A tuple of line number and column index.
        :rtype: :class:`tuple` of (int, int)
        """
        line = bisect.bisect_right(self.line_idxs, index)
        column = index if line == 0 else index - self.line_idxs[line - 1]
        # Note: line and column numbers are 1-indexed
        return (line + 1, column + 1)
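
    # A worked sketch of the bisect arithmetic above, with hand-built line
    # offsets for the three-line string "ab\ncd\ne". Illustrative only; not
    # used by the checker.
    @staticmethod
    def _demo_index_to_col_lineno():
        line_idxs = [3, 6, 7]  # cumulative end index of each line
        index = 4              # points at 'd': line 2, column 2
        line = bisect.bisect_right(line_idxs, index)
        column = index if line == 0 else index - line_idxs[line - 1]
        assert (line + 1, column + 1) == (2, 2)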

    def _filter_code_tokens(self, stream):
        """Filter the token stream based on token type and the name of the
        lexer.
        """
        for index, tokentype, value in merge_tokens(stream):
            # Handle token errors
            if tokentype is Token.Error:
                (line, _) = self._index_to_col_lineno(index)
                raise ParseError('%s: Parse error at line %s.' % (self.relname, line))
            # Lex Python docstrings with the reStructuredText lexer.
            if tokentype is String.Doc and self.code_lexer.name == 'Python':
                sub_lexer = lexers.get_lexer_by_name('reStructuredText')
                sub_stream = merge_tokens(sub_lexer.get_tokens_unprocessed(value))
                for sub_index, sub_type, sub_value in sub_stream:
                    # Select on the sub-token's own type; passing the
                    # enclosing String.Doc type here would reject every
                    # docstring token.
                    if self._select_token(sub_type, sub_lexer.name, sub_value):
                        yield (index + sub_index, sub_value)
            else:
                if self._select_token(tokentype, self.code_lexer.name, value):
                    yield (index, value)

    def _select_token(self, tokentype, name, value):
        """Return ``True`` if the token should be used, ``False`` otherwise."""
        # TODO: Make min length configurable.
        MIN_LENGTH = 10
        return (
            (tokentype in Comment and tokentype not in Comment.Preproc) or
            (tokentype in Token.Text) or
            (tokentype in Generic.Emph) or
            (tokentype in Generic.Strong) or
            # Ignore string literals in reStructuredText since these are used
            # as class and function references, and ignore Python raw-string
            # literals.
            (tokentype in Literal.String and len(value) > MIN_LENGTH and
             name != 'reStructuredText' and not self._is_rawstring(value))
        )

    def _is_rawstring(self, value):
        """Return ``True`` if value is a Python raw-string literal,
        ``False`` otherwise.
        """
        return self._rawstring_re.match(value) is not None
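
    # A sketch of the raw-string test: the class-level regex only inspects
    # the first two characters of the literal, so an uppercase R prefix
    # would slip through. Illustrative check only.
    @staticmethod
    def _demo_is_rawstring():
        rawstring_re = re.compile(r'^r["\']')
        assert rawstring_re.match(r"r'\d+'") is not None
        assert rawstring_re.match("'plain string'") is None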

    def errors(self):
        """Generator that yields :class:`SpellingCorrection` objects for the
        current source file.
        """
        stream = self.code_lexer.get_tokens_unprocessed(self.content)
        for index, value in self._filter_code_tokens(stream):
            for word, token_index in self.tokeniser(value):
                if not self.dict.check(word):
                    line, column = self._index_to_col_lineno(index + token_index)
                    # Get line content
                    lo = 0 if line == 1 else self.line_idxs[line - 2]
                    line_content = self.content[lo:self.line_idxs[line - 1]]
                    yield SpellingCorrection(
                        self.relname, word, index + token_index, line, column,
                        self.dict, line_content
                    )
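
    # A hedged usage sketch for the generator above. It assumes an en_GB
    # enchant dictionary is installed and that /repo/example.py exists; both
    # the path and the sample output are invented for illustration.
    @staticmethod
    def _demo_errors_usage():
        dictionary = enchant.Dict('en_GB')
        tokeniser = get_tokenizer(dictionary.tag, [EmailFilter, URLFilter])
        src = SourceFile('/repo/example.py', dictionary, tokeniser, '/repo')
        for error in src.errors():
            print(error)  # e.g. "example.py - Ln 3 Col 5: teh"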


class BaseChecker(object):
    """Common functionality for all checker classes.

    :param base_dir: The path to the base directory.
    :param ignore_patterns: List of glob ignore patterns to skip.
    :param language: ISO language code, e.g. 'en_GB' or 'en_US'
    :param project_dict: Path to the project dictionary for excluded words.
    :param encoding: Character set encoding to use reading / writing files.
    """

    def __init__(self, base_dir='.', ignore_patterns=None, language='en_GB',
                 project_dict=None, encoding='utf-8'):
        self.base_dir = os.path.realpath(base_dir)
        # Ignore common binary file formats and hidden files
        self.ignore_patterns = [
            '*.gif', '*.jpeg', '*.jpg', '*.bmp', '*.png', '*.exe', '*.dll',
            '*.webp', '*.pyc', '*.zip', '*.gz', '*/.*'
        ]
        # project_dict defaults to None, so guard before calling os.path.isabs.
        if project_dict is not None and not os.path.isabs(project_dict):
            project_dict = os.path.abspath(os.path.join(base_dir, project_dict))
        if ignore_patterns is not None:
            self.ignore_patterns.extend(
                [os.path.join(self.base_dir, pattern) for pattern in ignore_patterns]
            )
        self.dictionary = enchant.DictWithPWL(language, project_dict)
        self.ret_code = 0
        self.encoding = encoding
        # TODO: Consider breaking apart WikiWords instead of filtering them out.
        self.tokeniser = get_tokenizer(
            self.dictionary.tag,
            [EmailFilter, URLFilter, WikiWordFilter, HashBangFilter]
        )

    def _search_files(self):
        """Generator function which returns files to be checked."""
        for root, dirs, files in os.walk(self.base_dir):
            for name in files:
                filename = os.path.join(root, name)
                if any([fnmatch.fnmatch(filename, i) for i in self.ignore_patterns]):
                    continue
                yield filename
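
    # A hedged sketch of how the glob patterns are applied: fnmatch runs
    # against the full path, so '*/.*' catches hidden files anywhere under
    # the tree. The paths below are invented for illustration.
    @staticmethod
    def _demo_ignore_patterns():
        patterns = ['*.pyc', '*/.*']
        assert fnmatch.fnmatch('/repo/pkg/mod.pyc', '*.pyc')
        assert fnmatch.fnmatch('/repo/.hidden', '*/.*')
        assert not any(fnmatch.fnmatch('/repo/mod.py', p) for p in patterns)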

    def _process_file(self, src_file):
        """Called from :meth:`run` for each source file under the base
        directory.

        :param src_file: The source file being checked.
        :type src_file: :class:`SourceFile`
        """
        raise NotImplementedError

    def run(self):
        """Runs the checker.

        :returns: The script exit code.
        :rtype: int
        """
        for name in self._search_files():
            try:
                self._process_file(
                    SourceFile(name, self.dictionary, self.tokeniser,
                               self.base_dir, self.encoding)
                )
            except pygments.util.ClassNotFound:
                self.ret_code = 1
                continue
            except ParseError as e:
                print(e, file=sys.stderr)
                self.ret_code = 1
                continue
            except UnicodeDecodeError:
                self.ret_code = 1
                continue
            except (EmptyFileError, NextFile):
                # Skip empty files or jump to the next file.
                continue
            except StopIteration:
                # User quit.
                break
        return self.ret_code


class SpellChecker(BaseChecker):
    """Non-interactive spell checker. Prints a list of all spelling errors
    found.
    """

    def _process_file(self, src_file):
        """Prints errors to stderr and sets the error flag."""
        for error in src_file.errors():
            self.ret_code = 1
            print(error, file=sys.stderr)


class InteractiveChecker(BaseChecker):
    """Interactive spellchecker. Allows the user to quickly fix spelling
    errors and add words to the excluded words dictionary.
    """

    def _print_options(self):
        """Prints the list of keyboard options."""
        codes = OrderedDict([
            ('0-9', 'Use the numbered suggestion.'),
            ('a', 'Ignore the error and add the word to the excluded words.'),
            ('n', 'Go to the next file, saving existing changes.'),
            ('q', 'Exit immediately, discarding changes in the current file.')
        ])
        print()
        for code, text in codes.items():
            print("%s - %s" % (code, text))
        print("To skip to the next error, press any other key.")

    def _handle_response(self, src_map, error):
        """Handle the user response. Return ``True`` if a correction was made.

        :param src_map: The map of indexes to tokens.
        :type src_map: :class:`collections.OrderedDict`
        :param error: The spelling correction data.
        :type error: :class:`SpellingCorrection`
        """
        correction = False
        print("--->", end=" ")
        response = getchar()
        # Echo response
        print(response)
        # Correct with the numbered suggestion
        if response.isdigit():
            try:
                src_map[error.index] = error.suggestions[int(response)]
                correction = True
            except IndexError:
                print("%sInvalid selection, please try again.%s"
                      % (Back.RED, Style.RESET_ALL))
                # Retry, passing the source map through as well.
                return self._handle_response(src_map, error)
        # Add word to the excluded words list
        elif response == "a":
            self.dictionary.add(error.word)
        # Next file
        elif response == "n":
            raise NextFile()
        # Stop spellchecking
        elif response == "q":
            raise StopIteration()
        # Ignore the current error by default
        else:
            pass
        return correction

    def _process_file(self, src_file):
        """For each error in the file, prompt the user for the action to take.

        :param src_file: Source file being checked.
        :type src_file: :class:`SourceFile`
        """
        write_file = False
        src_map = self._get_source_map(src_file.content)
        for idx, error in enumerate(src_file.errors()):
            if idx == 0:
                print("\n%s%s:%s\n" % (Fore.GREEN, src_file.relname, Style.RESET_ALL))
            print(error.prompt())
            self._print_options()
            write_file |= self._handle_response(src_map, error)
        if write_file:
            with codecs.open(src_file.filename, 'w', self.encoding) as out_file:
                out_file.write(u''.join(src_map.values()))

    def _get_source_map(self, contents):
        """Creates a map of index, token pairs from the source file to handle
        spelling replacements.

        :param contents: The contents of the source file.
        :returns: The generated map.
        :rtype: :class:`collections.OrderedDict`
        """
        src_map = OrderedDict()
        offset = 0
        for token in re.split(r'(\W+)', contents):
            if token == '':
                continue
            src_map[offset] = token
            offset += len(token)
        return src_map
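
    # A small sketch of the source-map round trip: splitting on (\W+) keeps
    # the separators, so joining the values reproduces the original text and
    # each word sits at its character offset. The text is invented.
    def _demo_source_map(self):
        contents = "teh word"
        src_map = self._get_source_map(contents)
        assert ''.join(src_map.values()) == contents
        assert src_map[0] == 'teh'  # offset 0 holds the first word
        src_map[0] = 'the'          # a correction replaces the token in place
        assert ''.join(src_map.values()) == "the word"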


def get_parser(description=''):
    """Initialise the command line argument parsing.

    :returns: The argument parser.
    :rtype: :class:`argparse.ArgumentParser`
    """
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('--directory', '-d', default='.',
                        help='Base directory to search from')
    parser.add_argument(
        '--interactive', '-i', default=False, action='store_true',
        help='Run the interactive checker'
    )
    parser.add_argument(
        '--ignore-patterns', '-I', nargs='+', default=None,
        help='List of glob patterns to ignore'
    )
    parser.add_argument('--language', '-l', default='en_GB',
                        help='Language to use')
    parser.add_argument(
        '--excluded-words', '-e', default='.excluded-words',
        help='Path to excluded words list'
    )
    parser.add_argument('--encoding', '-E', default='utf-8',
                        help='Character encoding to use')
    parser.add_argument('--version', '-v', default=False, action='store_true',
                        help='Print version')
    return parser


def main():
    """Main entry point."""
    parser = get_parser(DESCRIPTION)
    args = parser.parse_args()
    if args.version:
        print(DESCRIPTION)
        print("Version: %s" % __version__)
    else:
        init()  # Initialise colorama
        checker_class = InteractiveChecker if args.interactive else SpellChecker
        checker = checker_class(args.directory, args.ignore_patterns,
                                args.language, args.excluded_words, args.encoding)
        sys.exit(checker.run())


if __name__ == "__main__":
    main()