From 6f4fd350d780a4b7c49b63bd78b58fcb1b1d840a Mon Sep 17 00:00:00 2001 From: Nathan Downs Date: Wed, 3 Apr 2019 17:53:39 -0700 Subject: [PATCH] Remove a large swath of unmaintained emoji regex generation code --- 2/scripts/generate | 451 +-------------------------------------------- 1 file changed, 6 insertions(+), 445 deletions(-) diff --git a/2/scripts/generate b/2/scripts/generate index 5d6bf726..390d0931 100755 --- a/2/scripts/generate +++ b/2/scripts/generate @@ -15,450 +15,7 @@ function file(which) { return path.join(__dirname, '../..', which); } -// Twitter assets by property name -var assets = { - '2/72x72': [], - '2/svg': [] -}; - -var skinToneOptions = [ - '\\ud83c\\udffb', - '\\ud83c\\udffc', - '\\ud83c\\udffd', - '\\ud83c\\udffe', - '\\ud83c\\udfff' -]; - -// white spaces we don't want to catch via the RegExp -// there is no asset equivalent for these -var ignoreMissing = ['2002', '2003', '2005']; - -// Items is an array of unicode sequences with \u escaping, like ["\u2963\ufe0f", "\u263a\ufe0f"] -// items get sorted by length (long to short), then unicode hex values (low to high) -// output is "or" ed together using | for regex -// ouput also combines adjacent items using character classes with ranges when they have common prefixes -// Example: "aab", "aac", "aad", "aag", "ba" becomes "aa[b-dg]|ba" -function generateRegexPartial(items) { - var currentPrefix = null; - var result = []; - var charClass = []; - var charRange = []; - items.map(function (item) { - // Convert from "\u2963\ufe0f" into ["2963", "fe0f"] - return item.split('\\u').slice(1); - }).sort(sortMethod).forEach(function (itemParts) { - var prefix = itemParts.slice(0, -1).join('\\u'); - if (prefix) { - prefix = '\\u' + prefix; - } - var suffix = itemParts.slice(-1); - if (prefix !== currentPrefix) { - flushCharClass(); - } - currentPrefix = prefix; - var suffixMinusOne = Utils.UTF162JSON(String.fromCharCode(parseInt(suffix, 16) - 1)); - - if (charRange.length && charRange.slice(-1)[0] !== suffixMinusOne) { - flushCharRange(); - } - charRange.push('\\u' + suffix); - }); - - flushCharClass(); - return result.join('|'); - - // a and b are arrays of hex UCS-2 units - function sortMethod(a, b) { - return !a.length ? 0 : - b.length - a.length || - parseInt(a[0], 16) - parseInt(b[0], 16) || - sortMethod(a.slice(1), b.slice(1) - ); - } - - function flushCharRange() { - charClass = charClass.concat((charRange.length < 3) ? - charRange : - [ charRange[0], '-', charRange.slice(-1)[0] ] - ); - charRange = []; - } - - function flushCharClass() { - flushCharRange(); - if (charClass.length) { - result.push(currentPrefix + (charClass.length == 1 ? - charClass[0] : - '[' + charClass.join('') + ']' - )); - } - charClass = []; - currentPrefix = null; - } -} - -// basic utility to organize async code -// see: http://webreflection.blogspot.co.uk/2012/03/tweet-sized-queue-system.html -// or: http://webreflection.blogspot.co.uk/2012/06/working-with-queues.html -function Queue(args, f) { - setTimeout(args.next = function next() { - return (f = args.shift()) ? !!f(args) || !0 : !1; - }, 0); - return args; -} - -// main task -Queue([ - - // will populate assets arrays - function grabAllAssets(q) { - console.log('analyzing all assets ... '); - // per each path/folder - Object.keys(assets).forEach(function (path, i, paths) { - // grab all files in that folder - fs.readdir(file(path), function (err, files) { - // and add them to the assets path - assets[path].push.apply( - assets[path], - files.map(upperCaseWithoutExtension) - ); - // once all assets arrays have been populated - if (paths.reduce(completed, true)) { - console.log('[INFO] assets contains ' + assets[path].length + ' emoji.'); - q.next(); - } - }); - }); - // drop extension + uppercase - function upperCaseWithoutExtension(file) { - return file.slice(0, file.lastIndexOf('.')).toUpperCase(); - } - // returns true if all assets have been populated - function completed(p, c) { - return p && assets[c].length; - } - }, - - // will fetch and store all emoji from unicode.org - function fetchEmojiSources(q) { - console.log('fetching EmojiSources.txt ... '); - // grab all emoji and test them against them - http.get("http://www.unicode.org/Public/UNIDATA/EmojiSources.txt", function (res) { - var chunks = []; - // if all good ... - if (res.statusCode === 200) { - // grab all data - res.on('data', chunks.push.bind(chunks)); - // once done ... - res.on('end', function () { - console.log('analyzing EmojiSources VS our assets ... '); - // store all missing assets in one object - var missing = {}; - // will be used to store an array with all missing - var missingGrouped = {}; - - // will be needed later on - // parse it, clean it, and store it once - q.emojiSource = chunks - .join('') - .split(/\r\n|\r|\n/) - // filter once - .filter(function (line) { - return this.test(line); - }, /^[0-9A-F]/) - // take only emoji info - .map(function (codePoint) { - return codePoint - .slice(0, codePoint.indexOf(';')) - .toUpperCase() - // drop spaces - .replace(/\s+/g, '-') - // drop 0 padded prefixes - .replace(/^0+/g, ''); - }); - - console.log('[INFO] parsed ' + q.emojiSource.length + ' standard emoji.'); - - // find out which one is missing from our assets - q.emojiSource.forEach( - function (emoji) { - // do not loop for emoji we know we should ignore - if (ignoreMissing.indexOf(emoji) < 0) { - // verify all others per each folder - this.forEach(function (path) { - if (assets[path].indexOf(emoji) < 0) { - (missing[path] || (missing[path] = [])).push(emoji); - missingGrouped[emoji] = true; - } - }); - } - }, - // and per each folder - Object.keys(assets) - ); - - // if some missing emoji has been found - if (Object.keys(missing).length) { - // warn and show which one is missing - console.warn('[WARNING] missing assets for:'); - console.log(missing); - } - // create the array of all emoji we should ignore - q.ignore = ignoreMissing.concat(Object.keys(missingGrouped)); - - q.next(); - }); - } else { - console.error('[ERROR] unable to fetch emoji at unicode.org'); - process.exit(1); - } - }); - }, - - // grab the list of emoji that behave differently when - // variants such \uFE0E and \uFE0F are in place - function grabStandardVariants(q) { - console.log('fetching StandardizedVariants.txt ... '); - http.get( - "http://unicode.org/Public/UNIDATA/StandardizedVariants.txt", - function(res) { - var chunks = []; - if (res.statusCode == 200) { - res.on('data', chunks.push.bind(chunks)); - res.on('end', function () { - // cleaning up parsing sensitive emoji - q.variantsSensitive = chunks - .join('') // all content - .split(/\r\n|\r|\n/) // split in lines - .filter(function (line) { // containing FE0E; info - return this.test(line); // avoiding duplicated with FE0F - }, / FE0E; text style/) - .map(function (line) { // cleaned up to grab - return line.replace(this, '$1') // only first unicode - .toUpperCase(); // normalized as uppercase - }, /^([0-9A-F]{4,}) FE0E;.+$/) // sensitive char - ; - - // iOS keyboard allows U+002A U+FE0F U+20E3 even though not a standardized variant (yet?) - q.variantsSensitive.push('002A'); - // iOS keyboard allows U+2639 U+FE0F even though not a standardized variant (yet?) - q.variantsSensitive.push('2639'); - - console.log('[INFO] parsed ' + q.variantsSensitive.length + ' variant sensitive emoji.'); - q.next(); - - }); - } else { - console.error('[ERROR] unable to fetch standard variants at unicode.org'); - process.exit(1); - } - } - ); - }, - - // add our own assets that are not part of the Unicode standard - function addMissingEmoji(q) { - q.nonStandard = []; - Object.keys(assets).forEach(function (path, i) { - assets[path].forEach(function (emoji) { - if ( - q.emojiSource.indexOf(emoji) < 0 && - q.nonStandard.indexOf(emoji) < 0 - ) { - q.nonStandard.push(emoji); - } - }); - }); - - if (q.nonStandard.length) { - console.warn('[WARNING] assets contain ' + q.nonStandard.length + ' non standard emoji:'); - // console.log(q.nonStandard.join(', ')); - } - - q.emojiSource = q.emojiSource.concat(q.nonStandard) - q.next(); - }, - - // detect complete sets of five skin tones and a base - function detectDiversityEmoji(q) { - var isPresent = {}; - q.emojiSource.forEach(function (codePoints) { - isPresent[codePoints] = true; - }); - q.diversityBase = q.emojiSource.filter(function (codePoints) { - // Start with the set of Emoji with the light skin tone - return /-1F3FB$/.test(codePoints); - }).map(function (codePoints) { - // Take the skin tone off - return codePoints.replace(/-1F3FB$/, ''); - }).filter(function (baseCodePoints) { - // Verify that all other skin tones + no skin tone are present - return ['-1F3FC', '-1F3FD', '-1F3FE', '-1F3FF', ''].every(function (suffix) { - return isPresent[baseCodePoints + suffix]; - }); - }); - console.log('[INFO] parsed ' + q.diversityBase.length + ' diversity emoji.'); - q.next(); - }, - - // detect complete sets of five skin tones and a base - function partitionEmojiTypes(q) { - console.log('partitioning emoji into types'); - q.zwj = []; - q.diversity = []; - q.sensitive = []; - q.sensitiveKeycaps = []; - q.diversitySensitive = []; - q.regular = []; - q.emojiSource.forEach(function (codePoints) { - var u; - var codePointsWithoutKeycap; - codePoints = codePoints.replace(/\b[A-F0-9]+\b/g, function (hex) { - // Pad all hex numbers to have at least 4 digits to match variantsSensitive - return hex.length < 4 ? ('000' + hex).slice(-4) : hex; - }); - if (q.ignore.indexOf(codePoints) < 0) { - u = Utils.toJSON(codePoints); - codePointsWithoutKeycap = codePoints.replace(/-20E3$/, ''); - if (codePoints.indexOf('200D') >= 0) { - q.zwj.push(u); - } else if (codePoints != codePointsWithoutKeycap && q.variantsSensitive.indexOf(codePointsWithoutKeycap) >= 0) { - q.sensitiveKeycaps.push(Utils.toJSON(codePointsWithoutKeycap)); - } else if (q.diversityBase.indexOf(codePoints.replace(/-1F3F[B-F]$/, '')) >= 0) { - // This is a diversity Emoji with or without a skin tone modifier - // Add it to the regex if this is the base without the modifier - if (q.diversityBase.indexOf(codePoints) >= 0) { - if (q.variantsSensitive.indexOf(codePoints) < 0) { - q.diversity.push(u); - } else { - q.diversitySensitive.push(u); - } - } - } else if (q.variantsSensitive.indexOf(codePoints) < 0) { - q.regular.push(u); - } else { - q.sensitive.push(u); - } - } - }); - q.next(); - }, - - function factorZwjSequences(q) { - q.zwjCommonPatterns = []; - - // There are dozens of new ZWJ sequences that have common prefixes or suffixes with - // skin tone + gender variations. To keep the main regex from growing excessively large and - // slow, choose some common sub-expressions to factor. - var commonPatterns = [ - { - name: 'leading man/woman zwj with optional skin tone', - re: '\\ud83d[\\udc68-\\udc69](?:\\ud83c[\\udffb-\\udfff])?\\u200d(.+?)', - numCombinations: 12 - }, { - name: 'variant or skin tone before trailing female/male zwj', - re: '(.+?)(?:\\ufe0f|\\ud83c[\\udffb-\\udfff])\\u200d[\\u2640\\u2642]\\ufe0f', - numCombinations: 12 - }, { - name: 'optional skin tone before trailing female/male zwj', - re: '(.+?)(?:\\ud83c[\\udffb-\\udfff])?\\u200d[\\u2640\\u2642]\\ufe0f', - numCombinations: 12 - } - ]; - - commonPatterns.forEach(function(pattern) { - var mapOfMatches = {}; - var re = new RegExp('^' + pattern.re + '$'); - q.zwj.forEach(function(jsonString) { - var rawString = JSON.parse('"' + jsonString + '"'); - var match = rawString.match(re); - if (match) { - var key = match[1]; - mapOfMatches[key] = mapOfMatches[key] || []; - mapOfMatches[key].push(match[0]); - } - }); - var replacements = []; - Object.keys(mapOfMatches).forEach(function(key) { - var matches = mapOfMatches[key]; - // Only a complete set may be replaced - if (matches.length === pattern.numCombinations) { - replacements.push(Utils.UTF162JSON(key)); - // Remove all items in the match set from the original zwj list - matches.forEach(function(rawString) { - var indexToRemove = q.zwj.indexOf(Utils.UTF162JSON(rawString)); - if (indexToRemove >= 0) { - q.zwj.splice(indexToRemove, 1); - } - }); - } - }); - if (replacements.length) { - // Replace the wildcard section of the regex with a regex group of replacements - var re = pattern.re.replace('(.+?', '(?:' + generateRegexPartial(replacements)); - q.zwjCommonPatterns.push(re); - console.log('Refactoring ' + replacements.length + ' complete sets of ' + pattern.numCombinations + ' zwj from ' + pattern.name); - } else { - console.log('did not find any complete sets of ' + pattern.name); - } - }); - - q.next(); - }, - - // with all info, generate a RegExp that will catch - // only standard emoji that are present in our assets - function generateRegExp(q) { - console.log('generating a RegExp for available assets'); - q.re = ''; - - // The Zero-width joiner common patterns, if present, need to come first - if (q.zwjCommonPatterns.length) { - q.re += q.zwjCommonPatterns.join('|') + '|'; - } - - // Then the rest of the zwjs - if (q.zwj.length) { - q.re += generateRegexPartial(q.zwj) + '|'; - } - - // Group the variant sensitive keycaps - if (q.sensitiveKeycaps.length) { - q.re += '(?:' + generateRegexPartial(q.sensitiveKeycaps) + ')\\ufe0f?\\u20e3|'; - } - - // Next, add the diversity enabled Emoji that may include a skin tone suffix - if (q.diversity.length + q.diversitySensitive.length) { - q.re += '(?:'; - if (q.diversitySensitive.length) { - // Some diversity are sensitive to variants - q.re += '(?:' + generateRegexPartial(q.diversitySensitive) + ')(?:\\ufe0f|(?!\\ufe0e))'; - if (q.diversity.length) { - q.re += '|'; - } - } - q.re += generateRegexPartial(q.diversity) + ')(?:' + generateRegexPartial(skinToneOptions) + '|)|'; - } - - // Next, the normal Emoji - q.re += generateRegexPartial(q.regular) + '|'; - - // Finally, add the rest of the sensitive ones that may be followed by U+FE0F but not U+FE0E - q.re += '(?:' + generateRegexPartial(q.sensitive) + ')(?:\\ufe0f|(?!\\ufe0e))'; - q.next(); - }, - - function generateFile(q) { - console.log('generating ./twemoji.js'); - createTwemoji(q.re); - require('./create-dist'); - } - -]); - - - -function createTwemoji(re) { +function createTwemoji() { fs.writeFileSync( file('2/twemoji.js'), '/*jslint indent: 2, browser: true, bitwise: true, plusplus: true */\n' + @@ -1042,4 +599,8 @@ function createTwemoji(re) { ) + '\n */' ) + '());'); -} \ No newline at end of file + +} + +createTwemoji(); +require('./create-dist');