Source code for nkdsu.apps.vote.parsers

import re
from dataclasses import dataclass
from typing import Iterable, Iterator, Optional

from django.conf import settings
from django.urls import reverse

from sly import Lexer
from sly.lex import LexError


@dataclass(frozen=True)
class ArtistChunk:
    """
    One contiguous piece of an artist credit string.

    ``text`` is the exact substring this chunk covers and ``is_artist``
    records whether that substring is an artist's name (as opposed to
    joining text such as separators or punctuation).
    """

    text: str
    is_artist: bool

    @property
    def url(self) -> Optional[str]:
        """
        The reversed ``vote:artist`` URL for this chunk's text, or ``None``
        when the chunk is not an artist name.
        """

        if not self.is_artist:
            return None
        return reverse('vote:artist', kwargs={'artist': self.text})

    @property
    def worth_linking_to(self) -> bool:
        """
        Whether this chunk is an artist name for which
        ``Track.objects.by_artist`` returns something truthy.
        """

        # Imported here rather than at module level, as in the original;
        # presumably to avoid a circular import with the models module.
        from .models import Track

        if not self.is_artist:
            return False
        return bool(Track.objects.by_artist(self.text))
@dataclass(frozen=True)
class ParsedArtist:
    """
    The result of parsing a whole artist credit string.

    ``chunks`` holds the pieces of the credit in order. ``should_collapse``
    is the flag computed by :func:`parse_artist`: it is set when the credit
    looks like a leading name followed by a single parenthesised list and
    more than two of the chunks are artists.

    Iterating over an instance iterates over its chunks.
    """

    chunks: list[ArtistChunk]
    should_collapse: bool

    def __iter__(self) -> Iterator[ArtistChunk]:
        # ``iter()`` hands back an iterator, so the return type is
        # ``Iterator`` rather than ``Iterable``; the iteration protocol
        # requires ``__iter__`` to return something with ``__next__``.
        return iter(self.chunks)
# Artist credits that are split by dedicated logic (see handle_special_case)
# instead of the general token rules. Each entry is:
#   (leading artist name, delimiter right after it, delimiter between the
#    collaborators that follow)
SPECIAL_CASES: list[tuple[str, str, str]] = [
    ('FLOW', 'x', '&'),
    ('SawanoHiroyuki[nZk]', ':', '&'),
    ('(K)NoW_NAME', ':', '&'),
]

# One capturing alternative per special case: the escaped "name + delimiter"
# prefix followed by the rest of the line.
#
# NOTE(review): alternation has the lowest precedence in a regex, so the
# leading ``^`` only constrains the first alternative and the trailing ``$``
# only the last one. If the intent was to anchor every alternative, the
# alternatives need wrapping in a group -- confirm before changing, since
# that would alter which strings lex as SPECIAL_CASE.
_SPECIAL_CASE_TOKEN = '^' + '|'.join(
    f'({re.escape(prefix + delimiter)}.*)'
    for prefix, delimiter, _collab_delim in SPECIAL_CASES
) + '$'
# Lexer that breaks an artist credit string into typed tokens.
#
# sly tries the token patterns in the order they are defined in the class
# body, so the order of the assignments below is significant: SPECIAL_CASE is
# tried first and the catch-all ARTIST_COMPONENT last.
class ArtistLexer(Lexer):
    tokens = {
        SPECIAL_CASE,  # type: ignore  # noqa
        ARTIST_COMPONENT,  # type: ignore  # noqa
        SPACE,  # type: ignore  # noqa
        COMMA,  # type: ignore  # noqa
        VIA,  # type: ignore  # noqa
        LPAREN,  # type: ignore  # noqa
        RPAREN,  # type: ignore  # noqa
        CV,  # type: ignore  # noqa
    }

    # Credits that start with one of the SPECIAL_CASES prefixes.
    SPECIAL_CASE = _SPECIAL_CASE_TOKEN

    # A joining word or symbol with whitespace on both sides.
    VIA = (
        r'\s+('
        r'from|'
        r'ft\.|'
        r'feat(\.|uring)?\.?|'
        r'[Ss]tarring|'
        r'and|'
        r'with|'
        r'meets|'
        r'adding|'
        r'hugs|'
        r'inspi\'|'
        r'a\.k\.a|'
        r'x|'
        r'×|'
        r'n\'|'
        r'vs\.?|'
        r'/|'
        r'\+|'
        r'&'
        r')\s+'
    )

    # An opening paren only counts when preceded by whitespace, and a closing
    # paren only when followed by whitespace, a comma, another closing paren
    # or the end of the string; other parens fail to match here.
    LPAREN = r'(?<=\s)\('
    RPAREN = r'\)(?=\s|,|\)|$)'

    # A role prefix followed by whitespace, or a bare "=".
    CV = (
        r'('
        r'CV[.:]|'
        r'[Vv]ocals?:|'
        r'[Mm]ain\svocals?:|'
        r'[Cc]omposed\sby|'
        r'[Ff]rom|'
        r'[Ff]eat(\.|uring)?|'
        r'[Pp]erformed\sby|'
        r'Vo\.'
        r')\s+|='
    )

    COMMA = r',(\sand)?\s+'
    SPACE = r'\s+'

    # Known names that contain characters or words which would otherwise be
    # lexed as separators, followed by the generic fallback: any run of
    # characters other than whitespace, "=", "," and parens.
    ARTIST_COMPONENT = (
        r'('
        r'\(K\)NoW_NAME|'
        r'AKIMA & NEOS|'
        r'ANNA TSUCHIYA inspi\' NANA\(BLACK STONES\)|'
        r'Bird Bear Hare and Fish|'
        r'Bread & Butter|'
        r'Carole\s&\sTuesday|'
        r'Daisy x Daisy|'
        r'Dejo & Bon|'
        # the "." is escaped so that it only matches a literal full stop
        r'Digz, Inc\. Group|'
        r'Dimitri From Paris|'
        r'Eunsol\(1008\)|'
        r'Fear,\sand\sLoathing\sin\sLas\sVegas|'
        r'HIGH and MIGHTY COLOR|'
        r'Hello, Happy World!|'
        r'Hifumi,inc\.|'
        r'Kamisama, Boku wa Kizuite shimatta|'
        r'Kevin & Cherry|'
        r'King & Queen|'
        r'Kisida Kyodan & The Akebosi Rockets|'
        r'Konya, Anomachikara|'
        r'Louis Armstrong and His Orchestra|'
        r'MYTH\s&\sROID|'
        r'OLIVIA inspi\' REIRA\(TRAPNEST\)|'
        r'Oranges\s(and|&)\sLemons|'
        r'Rough & Ready|'
        r'Run Girls, Run!|'
        r'Simon & Garfunkel|'
        r'Tackey & Tsubasa|'
        r'Takako & The Crazy Boys|'
        r'Voices From Mars|'
        r'Wake Up, [^\s]+!|'
        r'Yamagami Lucy \(…\)|'
        r'devils and realist|'
        # the leading "*" is a literal asterisk and must be escaped; a bare
        # "*" at the start of an alternative is a "nothing to repeat" error
        r'\*\(Asterisk\)|'
        r'[^\s=,()]+'
        r')'
    )
# Shared module-level lexer instance; chunk_artist() tokenizes through it.
artist_lexer = ArtistLexer()
def handle_special_case(token) -> Iterable[ArtistChunk]:
    """
    Split a ``SPECIAL_CASE`` token into chunks.

    The token's value is matched against :data:`SPECIAL_CASES` by prefix
    (name plus delimiter). The first matching entry decides how the value is
    split: the primary artist, the delimiter, and then each collaborator
    separated by that entry's collaborator delimiter.

    :raises NotImplementedError: if the value matches none of the entries.
    """

    value = token.value
    matched_case = next(
        (case for case in SPECIAL_CASES if value.startswith(case[0] + case[1])),
        None,
    )
    if matched_case is None:
        raise NotImplementedError(value)

    _, separator, collab_delim = matched_case
    primary, collaborators = value.split(separator, 1)

    yield ArtistChunk(primary, is_artist=True)
    yield ArtistChunk(separator, is_artist=False)

    # str.split always returns at least one item, so there is always a first
    # collaborator; every later one is preceded by its delimiter.
    first_collaborator, *other_collaborators = collaborators.split(collab_delim)
    yield ArtistChunk(first_collaborator, is_artist=True)
    for collaborator in other_collaborators:
        yield ArtistChunk(collab_delim, is_artist=False)
        yield ArtistChunk(collaborator, is_artist=True)
def check_for_group(full_string: str, maybe_group_name: str) -> bool:
    """
    Report whether ``full_string`` is ``maybe_group_name`` followed by exactly
    one parenthesised section that runs to the end of the string.

    The first occurrence of ``maybe_group_name`` is removed and what is left
    must start with ``' ('``, have balanced parens, and not return to a paren
    depth of zero before its final character.
    """

    rest = full_string.replace(maybe_group_name, '', 1)
    if rest[:2] != ' (':
        return False

    depth_change = {'(': 1, ')': -1}
    final_index = len(rest) - 1
    depth = 0

    for position, character in enumerate(rest):
        depth += depth_change.get(character, 0)
        # Reaching depth zero anywhere strictly between the first and last
        # characters means the parenthesised section closed early.
        if depth == 0 and 0 < position < final_index:
            return False

    return depth == 0
def _is_part_of_artist_name(tokens: list, index: int) -> bool:
    """Decide whether the token at ``index`` belongs to an artist's name."""

    token_type = tokens[index].type
    if token_type == 'ARTIST_COMPONENT':
        return True
    if token_type != 'SPACE':
        return False

    # A space is only part of a name when it sits directly between two
    # artist components.
    followed_by_component = (
        index + 1 < len(tokens)
        and tokens[index + 1].type == 'ARTIST_COMPONENT'
    )
    preceded_by_component = (
        index > 0
        and tokens[index - 1].type == 'ARTIST_COMPONENT'
    )
    return followed_by_component and preceded_by_component


def chunk_artist(string: str, fail_silently: bool = True) -> Iterable[ArtistChunk]:
    """
    Return a bunch of :class:`ArtistChunk`\\ s which, when combined, reform the
    string handed in.

    Consecutive tokens of the same kind (artist name / not artist name) are
    merged into a single chunk. ``SPECIAL_CASE`` tokens are expanded by
    :func:`handle_special_case`.

    :param string: the artist credit to split up.
    :param fail_silently: if true, a string the lexer rejects is returned as
        one single artist chunk (with a message printed when
        ``settings.DEBUG`` is set); if false, the ``LexError`` propagates.
    :raises LexError: if lexing fails and ``fail_silently`` is false.
    """

    # look i don't understand how sly works, and i think i might need to spend
    # like a week learning BNF if i want to use its Parser interface, and even
    # then i don't know that it'd help us here, so im just gonna use the lexer
    # and hack the rest of this:

    try:
        tokens = list(artist_lexer.tokenize(string))
    except LexError as e:
        if not fail_silently:
            raise
        if settings.DEBUG:
            print(f'problem parsing artist name {string!r}:\n {e}')
        yield ArtistChunk(text=string, is_artist=True)
        return

    # The chunk currently being accumulated, as (is_artist, text).
    fragment: Optional[tuple[bool, str]] = None

    for index, token in enumerate(tokens):
        if token.type == 'SPECIAL_CASE':
            # Emit whatever has been accumulated so far first; otherwise text
            # that precedes the special case would come out after it and the
            # chunks would no longer recombine into the original string.
            if fragment is not None:
                yield ArtistChunk(fragment[1], is_artist=fragment[0])
                fragment = None
            yield from handle_special_case(token)
            continue

        is_artist = _is_part_of_artist_name(tokens, index)

        if fragment is not None:
            if fragment[0] == is_artist:
                # same kind as the running fragment, so keep extending it
                fragment = (is_artist, fragment[1] + token.value)
                continue
            yield ArtistChunk(fragment[1], is_artist=fragment[0])

        fragment = (is_artist, token.value)

    if fragment is not None:
        yield ArtistChunk(fragment[1], is_artist=fragment[0])
def parse_artist(string: str, fail_silently: bool = True) -> ParsedArtist:
    """
    Parse an artist credit into a :class:`ParsedArtist`.

    An empty string gives an empty, non-collapsing result. Otherwise the
    string is split with :func:`chunk_artist`, and ``should_collapse`` is set
    when :func:`check_for_group` accepts the first chunk as a group name and
    more than two of the chunks are artists.

    ``fail_silently`` is passed straight through to :func:`chunk_artist`.
    """

    if string:
        chunks = list(chunk_artist(string, fail_silently=fail_silently))
        looks_like_group = check_for_group(string, chunks[0].text)
        artist_count = sum(1 for chunk in chunks if chunk.is_artist)
        return ParsedArtist(
            chunks=chunks,
            should_collapse=looks_like_group and artist_count > 2,
        )

    return ParsedArtist(chunks=[], should_collapse=False)