import re
from dataclasses import dataclass
from typing import Iterable, Optional

from django.conf import settings
from django.urls import reverse

from sly import Lexer
from sly.lex import LexError


@dataclass(frozen=True)
class ArtistChunk:
    text: str
    is_artist: bool

    @property
    def url(self) -> Optional[str]:
        return (
            reverse('vote:artist', kwargs={'artist': self.text})
            if self.is_artist
            else None
        )

    @property
    def worth_linking_to(self) -> bool:
        from .models import Track

        return bool(self.is_artist and Track.objects.by_artist(self.text))
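
# An illustrative sketch of how ArtistChunk behaves (values are hypothetical;
# the exact URL depends on this project's URLconf):
#
#   ArtistChunk('fhána', is_artist=True).url     # a link to the 'fhána' artist page
#   ArtistChunk(' feat. ', is_artist=False).url  # None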


@dataclass(frozen=True)
class ParsedArtist:
    chunks: list[ArtistChunk]
    should_collapse: bool

    def __iter__(self) -> Iterable[ArtistChunk]:
        return iter(self.chunks)


SPECIAL_CASES: list[tuple[str, str, str]] = [
    ('FLOW', 'x', '&'),
    ('SawanoHiroyuki[nZk]', ':', '&'),
    ('(K)NoW_NAME', ':', '&'),
]

_SPECIAL_CASE_TOKEN = r'^{}$'.format(
    '|'.join(
        r'({}.*)'.format(re.escape(startswith + split))
        for startswith, split, collab_delim in SPECIAL_CASES
    )
)
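
# For illustration, the pattern generated from the SPECIAL_CASES above is:
#
#   ^(FLOWx.*)|(SawanoHiroyuki\[nZk\]:.*)|(\(K\)NoW_NAME:.*)$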


class ArtistLexer(Lexer):
    tokens = {
        SPECIAL_CASE,  # type: ignore  # noqa
        ARTIST_COMPONENT,  # type: ignore  # noqa
        SPACE,  # type: ignore  # noqa
        COMMA,  # type: ignore  # noqa
        VIA,  # type: ignore  # noqa
        LPAREN,  # type: ignore  # noqa
        RPAREN,  # type: ignore  # noqa
        CV,  # type: ignore  # noqa
    }

    SPECIAL_CASE = _SPECIAL_CASE_TOKEN
    VIA = (
        r'\s+('
        r'from|'
        r'ft\.|'
        r'feat(\.|uring)?\.?|'
        r'[Ss]tarring|'
        r'and|'
        r'with|'
        r'meets|'
        r'adding|'
        r'hugs|'
        r'inspi\'|'
        r'a\.k\.a|'
        r'x|'
        r'×|'
        r'n\'|'
        r'vs\.?|'
        r'/|'
        r'\+|'
        r'&'
        r')\s+'
    )
    LPAREN = r'(?<=\s)\('
    RPAREN = r'\)(?=\s|,|\)|$)'
    CV = (
        r'('
        r'CV[.:]|'
        r'[Vv]ocals?:|'
        r'[Mm]ain\svocals?:|'
        r'[Cc]omposed\sby|'
        r'[Ff]rom|'
        r'[Ff]eat(\.|uring)?|'
        r'[Pp]erformed\sby|'
        r'Vo\.'
        r')\s+|='
    )
    COMMA = r',(\sand)?\s+'
    SPACE = r'\s+'
    ARTIST_COMPONENT = (
        r'('
        r'\(K\)NoW_NAME|'
        r'AKIMA & NEOS|'
        r'ANNA TSUCHIYA inspi\' NANA\(BLACK STONES\)|'
        r'Bird Bear Hare and Fish|'
        r'Bread & Butter|'
        r'Carole\s&\sTuesday|'
        r'Daisy x Daisy|'
        r'Dejo & Bon|'
        r'Digz, Inc. Group|'
        r'Dimitri From Paris|'
        r'Eunsol\(1008\)|'
        r'Fear,\sand\sLoathing\sin\sLas\sVegas|'
        r'HIGH and MIGHTY COLOR|'
        r'Hello, Happy World!|'
        r'Hifumi,inc\.|'
        r'Kamisama, Boku wa Kizuite shimatta|'
        r'Kevin & Cherry|'
        r'King & Queen|'
        r'Kisida Kyodan & The Akebosi Rockets|'
        r'Konya, Anomachikara|'
        r'Louis Armstrong and His Orchestra|'
        r'MYTH\s&\sROID|'
        r'OLIVIA inspi\' REIRA\(TRAPNEST\)|'
        r'Oranges\s(and|&)\sLemons|'
        r'Rough & Ready|'
        r'Run Girls, Run!|'
        r'Simon & Garfunkel|'
        r'Tackey & Tsubasa|'
        r'Takako & The Crazy Boys|'
        r'Voices From Mars|'
        r'Wake Up, [^\s]+!|'
        r'Yamagami Lucy \(…\)|'
        r'devils and realist|'
        r'\*\(Asterisk\)|'
        r'[^\s=,()]+'
        r')'
    )


artist_lexer = ArtistLexer()
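
# A hedged usage sketch (the example string and its tokenisation are
# illustrative, not taken from the test suite):
#
#   >>> [(t.type, t.value) for t in artist_lexer.tokenize('fhána feat. yuikonnu')]
#   [('ARTIST_COMPONENT', 'fhána'), ('VIA', ' feat. '), ('ARTIST_COMPONENT', 'yuikonnu')]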


def handle_special_case(token) -> Iterable[ArtistChunk]:
    for startswith, split, collab_delim in SPECIAL_CASES:
        if token.value.startswith(startswith + split):
            primary, collaborators = token.value.split(split, 1)
            yield ArtistChunk(primary, is_artist=True)
            yield ArtistChunk(split, is_artist=False)
            for i, collaborator in enumerate(collaborators.split(collab_delim)):
                if i != 0:
                    yield ArtistChunk(collab_delim, is_artist=False)
                yield ArtistChunk(collaborator, is_artist=True)
            break
    else:
        raise NotImplementedError(token.value)
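
# Illustrative walk-through for the ('SawanoHiroyuki[nZk]', ':', '&') special
# case above (the input string is hypothetical):
#
#   'SawanoHiroyuki[nZk]:Tielle&Gemie' yields, in order:
#     ArtistChunk('SawanoHiroyuki[nZk]', is_artist=True)
#     ArtistChunk(':', is_artist=False)
#     ArtistChunk('Tielle', is_artist=True)
#     ArtistChunk('&', is_artist=False)
#     ArtistChunk('Gemie', is_artist=True)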


def check_for_group(full_string: str, maybe_group_name: str) -> bool:
    """
    Return True if ``maybe_group_name`` is followed in ``full_string`` by a
    single parenthesised section that runs to the end of the string.
    """

    remainder = full_string.replace(maybe_group_name, '', 1)

    if not remainder.startswith(' ('):
        return False

    paren_count = 0

    for i, char in enumerate(remainder):
        if char == '(':
            paren_count += 1
        elif char == ')':
            paren_count -= 1

        # if the parens close anywhere before the end of the remainder, this
        # is not one parenthesised section spanning the whole remainder
        if (paren_count == 0) and (i > 0) and (i < (len(remainder) - 1)):
            return False

    return paren_count == 0
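
# Illustrative behaviour, with hypothetical inputs:
#
#   check_for_group('ClariS (Clara, Karen)', 'ClariS')   # True
#   check_for_group('ClariS (Clara) & Karen', 'ClariS')  # False: parens close early
#   check_for_group('ClariS & Karen', 'ClariS')          # False: no ' (' after the name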


def chunk_artist(string: str, fail_silently: bool = True) -> Iterable[ArtistChunk]:
    """
    Return a bunch of :class:`ArtistChunk`\\ s which, when combined, reform the
    string handed in.
    """

    # look i don't understand how sly works, and i think i might need to spend
    # like a week learning BNF if i want to use its Parser interface, and even
    # then i don't know that it'd help us here, so im just gonna use the lexer
    # and hack the rest of this:
    try:
        tokens = list(artist_lexer.tokenize(string))
    except LexError as e:
        if fail_silently:
            if settings.DEBUG:
                print(f'problem parsing artist name {string!r}:\n {e}')
            yield ArtistChunk(text=string, is_artist=True)
            return
        else:
            raise e

    artist_parts = ('ARTIST_COMPONENT', 'SPACE')

    fragment: Optional[tuple[bool, str]] = None

    for ti, token in enumerate(tokens):
        if token.type == 'SPECIAL_CASE':
            yield from handle_special_case(token)
            continue

        is_part_of_artist_name = (token.type in artist_parts) and (
            (token.type != 'SPACE')
            or (
                # if this is a space, then:
                (
                    # be false if the next token isn't an artist component
                    (ti + 1 < len(tokens))
                    and (tokens[ti + 1].type == 'ARTIST_COMPONENT')
                )
                and (
                    # or if the previous one wasn't, either
                    (ti > 0)
                    and (tokens[ti - 1].type == 'ARTIST_COMPONENT')
                )
            )
        )

        if fragment:
            if is_part_of_artist_name == fragment[0]:
                fragment = (fragment[0], fragment[1] + token.value)
                continue
            yield ArtistChunk(fragment[1], is_artist=fragment[0])

        fragment = (is_part_of_artist_name, token.value)

    if fragment:
        yield ArtistChunk(fragment[1], is_artist=fragment[0])
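
# A hedged sketch of the output (the input string is illustrative):
#
#   list(chunk_artist('fhána feat. yuikonnu'))
#   # -> [ArtistChunk(text='fhána', is_artist=True),
#   #     ArtistChunk(text=' feat. ', is_artist=False),
#   #     ArtistChunk(text='yuikonnu', is_artist=True)]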


def parse_artist(string: str, fail_silently: bool = True) -> ParsedArtist:
    if not string:
        return ParsedArtist(chunks=[], should_collapse=False)

    chunks = list(chunk_artist(string, fail_silently=fail_silently))
    naive_is_group = check_for_group(string, chunks[0].text)
    return ParsedArtist(
        chunks=chunks,
        should_collapse=naive_is_group
        and len([chunk for chunk in chunks if chunk.is_artist]) > 2,
    )
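
# Hypothetical usage sketch: parse_artist() wraps chunk_artist(), and only sets
# should_collapse when the string looks like a group name followed by a
# parenthesised member list and more than two of the chunks are artists.
#
#   parsed = parse_artist('fhána feat. yuikonnu')
#   [chunk.text for chunk in parsed if chunk.is_artist]  # ['fhána', 'yuikonnu']
#   parsed.should_collapse                               # False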