Cleaning-up Persian Texts!
This script should not be installed directly. It is a library for other scripts to include with the meta directive // @require https://update.greasyfork.org/scripts/527228/1538801/Virastar%20Library.js
// ==UserScript==
// @name         Virastar Library
// @version      0.21.0
// @description  Cleaning-up Persian Texts!
// @homepage     https://github.com/brothersincode/virastar/
// @namespace    amm1rr.com.virastar
// @name:fa      کتابخانه ویراستار
// @description:fa ویراستار متنِ فارسی
// @grant        none
// @updateURL    https://raw.githubusercontent.com/brothersincode/virastar/master/lib/virastar.js
// @downloadURL  https://raw.githubusercontent.com/brothersincode/virastar/master/lib/virastar.js
// @license      MIT
// ==/UserScript==

/*!
 * Virastar - v0.21.0 - 2020-05-14
 * https://github.com/brothersincode/virastar
 * Licensed: MIT
 */
(function (name, global, definition) {
  // UMD-style export: CommonJS, then AMD, then browser global, then the
  // fallback global object handed in below.
  if (typeof module !== 'undefined') module.exports = definition();
  else if (typeof define === 'function' && typeof define.amd === 'object') define(definition);
  else if (typeof window !== 'undefined') window[name] = definition();
  else global[name] = definition();
  // `this` is undefined at the top level of ES modules / strict scripts, so
  // prefer `globalThis` when the host provides it.
}('Virastar', typeof globalThis !== 'undefined' ? globalThis : this, function () {
  /**
   * Virastar entry point.
   *
   * - `new Virastar(options)` / `Virastar(options)` returns an instance whose
   *   `cleanup(text)` method uses the given options merged over `defaults`.
   * - `Virastar(text, options)` (called WITHOUT `new`, with a non-empty
   *   string) is a shortcut that returns the cleaned-up string directly.
   * - `new Virastar(text, options)` can only return an object (a constructor
   *   cannot return a primitive), so it yields an instance configured with
   *   `options`; the text itself is not processed.
   *
   * @param {(string|Object)} [text] text to clean, or an options object
   * @param {Object} [options] options, used when `text` is a string
   * @returns {(Virastar|string)}
   */
  function Virastar (text, options) {
    text = text || {};

    if (!(this instanceof Virastar)) {
      if (typeof text === 'string') {
        // string shortcut: must go through an instance so that `cleanup`
        // sees the parsed options.
        return new Virastar(options).cleanup(text);
      }

      return new Virastar(text, options);
    }

    if (typeof text === 'object') {
      this.opts = parseOptions(text);
    } else if (typeof text === 'string') {
      this.opts = parseOptions(options || {});
    } else {
      // anything else: fall back to defaults so `cleanup` stays usable
      this.opts = parseOptions({});
    }

    return this;
  }

  /**
   * Returns a copy of `defaults` overridden by the own properties of
   * `options` that are known option names; unknown keys are ignored.
   *
   * @param {Object} [options]
   * @returns {Object}
   */
  function parseOptions (options) {
    // @ref: https://scotch.io/bar-talk/copying-objects-in-javascript
    var parsed = Object.assign({}, defaults);
    var given = options || {};

    Object.keys(parsed).forEach(function (key) {
      // `call` form keeps working for prototype-less option objects
      if (Object.prototype.hasOwnProperty.call(given, key)) {
        parsed[key] = given[key];
      }
    });

    return parsed;
  }

  // escapes regex metacharacters so a literal character can be used as a pattern
  function escapeRegExp (string) {
    return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Replaces every occurrence of the i-th character of `fromBatch` with the
   * i-th character of `toBatch`. Characters without a counterpart in
   * `toBatch` are left untouched.
   */
  function charReplace (text, fromBatch, toBatch) {
    var fromChars = fromBatch.split('');
    var toChars = toBatch.split('');
    var count = Math.min(fromChars.length, toChars.length);

    for (var i = 0; i < count; i++) {
      text = text.replace(newRegExp(escapeRegExp(fromChars[i])), toChars[i]);
    }

    return text;
  }

  /**
   * For every `key: chars` pair of `array`, replaces any character listed in
   * `chars` with `key`. Pairs are applied in insertion order.
   */
  function arrReplace (text, array) {
    Object.keys(array).forEach(function (key) {
      text = text.replace(newRegExp('[' + array[key] + ']'), key);
    });

    return text;
  }

  // builds a RegExp; global flag by default
  function newRegExp (pattern, flags) {
    return new RegExp(pattern, flags || 'g');
  }

  var charsPersian = 'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك';

  // @REF: https://en.wikipedia.org/wiki/Persian_alphabet#Diacritics
  // `\u064e\u0650\u064f\u064b\u064d\u064c\u0651\u06c0`
  var charsDiacritic = 'ًٌٍَُِّْ';

  // @source: https://github.com/jhermsmeier/uri.regex
  var patternURI = "([A-Za-z][A-Za-z0-9+\\-.]*):(?:(//)(?:((?:[A-Za-z0-9\\-._~!$&'()*+,;=:]|%[0-9A-Fa-f]{2})*)@)?((?:\\[(?:(?:(?:(?:[0-9A-Fa-f]{1,4}:){6}|::(?:[0-9A-Fa-f]{1,4}:){5}|(?:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){4}|(?:(?:[0-9A-Fa-f]{1,4}:){0,1}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){3}|(?:(?:[0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){2}|(?:(?:[0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}:|(?:(?:[0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})?::)(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))|(?:(?:[0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}|(?:(?:[0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})?::)|[Vv][0-9A-Fa-f]+\\.[A-Za-z0-9\\-._~!$&'()*+,;=:]+)\\]|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:[A-Za-z0-9\\-._~!$&'()*+,;=]|%[0-9A-Fa-f]{2})*))(?::([0-9]*))?((?:/(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})*)*)|/((?:(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})+(?:/(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})*)*)?)|((?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})+(?:/(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})*)*)|)(?:\\?((?:[A-Za-z0-9\\-._~!$&'()*+,;=:@/?]|%[0-9A-Fa-f]{2})*))?(?:\\#((?:[A-Za-z0-9\\-._~!$&'()*+,;=:@/?]|%[0-9A-Fa-f]{2})*))?";

  // characters allowed right after a suffix (used inside a character class)
  var patternAfter = '\\s.,;،؛!؟?"\'()[\\]{}“”«»';

  var defaults = {
    // aggresive: true, // DEPRECATED
    cleanup_begin_and_end: true,
    cleanup_extra_marks: true,
    cleanup_kashidas: true,
    cleanup_line_breaks: true,
    cleanup_rlm: true,
    cleanup_spacing: true,
    cleanup_zwnj: true,
    decode_htmlentities: true,
    fix_arabic_numbers: true,
    fix_dashes: true,
    fix_diacritics: true,
    fix_english_numbers: true,
    fix_english_quotes_pairs: true,
    fix_english_quotes: true,
    fix_hamzeh: true,
    fix_hamzeh_arabic: false,
    fix_misc_non_persian_chars: true,
    fix_misc_spacing: true,
    fix_numeral_symbols: true,
    fix_perfix_spacing: true,
    fix_persian_glyphs: true,
    fix_punctuations: true,
    fix_question_mark: true,
    fix_spacing_for_braces_and_quotes: true,
    fix_spacing_for_punctuations: true,
    fix_suffix_misc: true,
    fix_suffix_spacing: true,
    fix_three_dots: true,
    kashidas_as_parenthetic: true,
    markdown_normalize_braces: true,
    markdown_normalize_lists: true,
    normalize_dates: true,
    normalize_ellipsis: true,
    normalize_eol: true,
    preserve_braces: false,
    preserve_brackets: false,
    preserve_comments: true,
    preserve_entities: true,
    preserve_frontmatter: true,
    preserve_HTML: true,
    preserve_nbsps: true,
    preserve_URIs: true,
    remove_diacritics: false,
    skip_markdown_ordered_lists_numbers_conversion: false
  };

  var digits = '۱۲۳۴۵۶۷۸۹۰';

  /* eslint-disable */
  // named entities that `decodeHTMLEntities()` converts to characters
  var entities = {
    'sbquo;': '\u201a',
    'lsquo;': '\u2018',
    'lsquor;': '\u201a',
    'ldquo;': '\u201c',
    'ldquor;': '\u201e',
    'rdquo;': '\u201d',
    'rdquor;': '\u201d',
    'rsquo;': '\u2019',
    'rsquor;': '\u2019',
    'apos;': '\'',
    'QUOT;': '"',
    'QUOT': '"',
    'quot;': '"',
    'quot': '"',
    'zwj;': '\u200d',
    'ZWNJ;': '\u200c',
    'zwnj;': '\u200c',
    'shy;': '\u00ad' // wrongly used as zwnj
  };

  // props @ebraminio/persiantools
  // `replacement: characters-to-replace`; applied in this order
  var glyphs = {
    // these two are for visually available ZWNJ #visualZwnj
    '\u200cه': 'ﻫ',
    'ی\u200c': 'ﻰﻲ',
    'ﺃ': 'ﺄﺃ',
    'ﺁ': 'ﺁﺂ',
    'ﺇ': 'ﺇﺈ',
    'ا': 'ﺎا',
    'ب': 'ﺏﺐﺑﺒ',
    'پ': 'ﭖﭗﭘﭙ',
    'ت': 'ﺕﺖﺗﺘ',
    'ث': 'ﺙﺚﺛﺜ',
    'ج': 'ﺝﺞﺟﺠ',
    'چ': 'ﭺﭻﭼﭽ',
    'ح': 'ﺡﺢﺣﺤ',
    'خ': 'ﺥﺦﺧﺨ',
    'د': 'ﺩﺪ',
    'ذ': 'ﺫﺬ',
    'ر': 'ﺭﺮ',
    'ز': 'ﺯﺰ',
    'ژ': 'ﮊﮋ',
    'س': 'ﺱﺲﺳﺴ',
    'ش': 'ﺵﺶﺷﺸ',
    'ص': 'ﺹﺺﺻﺼ',
    'ض': 'ﺽﺾﺿﻀ',
    'ط': 'ﻁﻂﻃﻄ',
    'ظ': 'ﻅﻆﻇﻈ',
    'ع': 'ﻉﻊﻋﻌ',
    'غ': 'ﻍﻎﻏﻐ',
    'ف': 'ﻑﻒﻓﻔ',
    'ق': 'ﻕﻖﻗﻘ',
    'ک': 'ﮎﮏﮐﮑﻙﻚﻛﻜ',
    'گ': 'ﮒﮓﮔﮕ',
    'ل': 'ﻝﻞﻟﻠ',
    'م': 'ﻡﻢﻣﻤ',
    'ن': 'ﻥﻦﻧﻨ',
    'ه': 'ﻩﻪﻫﻬ',
    'هٔ': 'ﮤﮥ',
    'و': 'ﻭﻮ',
    'ﺅ': 'ﺅﺆ',
    'ی': 'ﯼﯽﯾﯿﻯﻰﻱﻲﻳﻴ',
    'ئ': 'ﺉﺊﺋﺌ',
    'لا': 'ﻼ',
    'ﻹ': 'ﻺ',
    'ﻷ': 'ﻸ',
    'ﻵ': 'ﻶ'
  };
  /* eslint-enable */

  // replaces every match of `pattern` with `placeholder`, remembering the
  // matched text in `store` (in order of appearance)
  function preserve (text, pattern, placeholder, store) {
    return text.replace(pattern, function (matched) {
      store.push(matched);
      return placeholder;
    });
  }

  // puts the remembered texts back, in order; a placeholder with nothing
  // left in `store` is kept as-is instead of becoming the text "undefined"
  function restore (text, pattern, store) {
    return text.replace(pattern, function (matched) {
      return store.length ? store.shift() : matched;
    });
  }

  /**
   * Runs the whole clean-up pipeline over `text`.
   *
   * Parts that must not be touched (frontmatter, HTML tags/comments, URIs,
   * entities, ...) are swapped with placeholders first, the enabled fixers
   * run in a fixed order, and the preserved parts are restored at the end.
   *
   * @param {string} text
   * @param {Object} [options] per-call options, merged over `defaults`
   *   (NOT over the instance options); when omitted the instance options
   *   are used
   * @returns {string}
   * @throws {TypeError} when `text` is not a string
   */
  function cleanup (text, options) {
    if (typeof text !== 'string') {
      throw new TypeError('Expected a String, but received ' + typeof text);
    }

    // dont bother if its empty or whitespace
    if (!text.trim()) {
      return text;
    }

    // falls back to defaults when called without an instance
    var opts = options
      ? parseOptions(options)
      : ((this && this.opts) || parseOptions({}));

    var frontmatter = [];
    var html = [];
    var comments = [];
    var brackets = [];
    var braces = [];
    var mdlinks = [];
    var uris = [];
    var nbsps = [];
    var preservedEntities = [];

    // single space paddings around the string
    text = ' ' + text + ' ';

    // preserves frontmatter data in the text
    if (opts.preserve_frontmatter) {
      text = preserve(text, /^ ---[\S\s]*?---\n/g, ' __FRONTMATTER__PRESERVER__ ', frontmatter);
    }

    // preserves all html tags in the text
    // @props: @wordpress/wordcount
    if (opts.preserve_HTML) {
      text = preserve(text, /<\/?[a-z][^>]*?>/gi, ' __HTML__PRESERVER__ ', html);
    }

    // preserves all html comments in the text
    // @props: @wordpress/wordcount
    if (opts.preserve_comments) {
      text = preserve(text, /<!--[\s\S]*?-->/g, ' __COMMENT__PRESERVER__ ', comments);
    }

    // preserves strings inside square brackets (`[]`)
    if (opts.preserve_brackets) {
      text = preserve(text, /(\[.*?\])/g, ' __BRACKETS__PRESERVER__ ', brackets);
    }

    // preserves strings inside curly braces (`{}`)
    if (opts.preserve_braces) {
      text = preserve(text, /(\{.*?\})/g, ' __BRACES__PRESERVER__ ', braces);
    }

    // preserves all uri strings in the text
    if (opts.preserve_URIs) {
      // stores markdown links separetly
      text = text.replace(/]\((.*?)\)/g, function (matched, link) {
        if (link) {
          mdlinks.push(link.trim());
          return '](__MD_LINK__PRESERVER__)'; // no padding!
        }

        return matched;
      });

      text = preserve(text, newRegExp(patternURI), ' __URI__PRESERVER__ ', uris);
    }

    // preserves all no-break space entities in the text
    // NOTE: must run before `decodeHTMLEntities()`, which would otherwise
    // turn `&#160;` into a plain no-break space character
    if (opts.preserve_nbsps) {
      text = preserve(text, /&nbsp;|&#160;/gi, ' __NBSPS__PRESERVER__ ', nbsps);
    }

    if (opts.decode_htmlentities) {
      text = decodeHTMLEntities(text);
    }

    // preserves all html entities in the text
    // @props: @substack/node-ent
    if (opts.preserve_entities) {
      text = preserve(text, /&(#?[^;\W]+;?)/g, ' __ENTITIES__PRESERVER__ ', preservedEntities);
    }

    if (opts.normalize_eol) {
      text = normalizeEOL(text);
    }

    if (opts.fix_persian_glyphs) {
      text = fixPersianGlyphs(text);
    }

    if (opts.fix_dashes) {
      text = fixDashes(text);
    }

    if (opts.fix_three_dots) {
      text = fixThreeDots(text);
    }

    if (opts.normalize_ellipsis) {
      text = normalizeEllipsis(text);
    }

    if (opts.fix_english_quotes_pairs) {
      text = fixEnglishQuotesPairs(text);
    }

    if (opts.fix_english_quotes) {
      text = fixEnglishQuotes(text);
    }

    if (opts.fix_hamzeh) {
      if (opts.fix_hamzeh_arabic) {
        text = fixHamzehArabic(text);
      }

      text = fixHamzeh(text);
    } else if (opts.fix_suffix_spacing) {
      if (opts.fix_hamzeh_arabic) {
        text = fixHamzehArabicAlt(text);
      }

      text = fixSuffixSpacingHamzeh(text);
    }

    if (opts.cleanup_rlm) {
      text = cleanupRLM(text);
    }

    if (opts.cleanup_zwnj) {
      text = cleanupZWNJ(text);
    }

    if (opts.fix_arabic_numbers) {
      text = fixArabicNumbers(text);
    }

    // word tokenizer
    text = text.replace(/(^|\s+)([[({"'“«]?)(\S+)([\])}"'”»]?)(?=($|\s+))/g,
      function (matched, before, leadings, word, trailings, after) {
        // should not replace to persian chars in english phrases
        if (word.match(/[a-zA-Z\-_]{2,}/g)) {
          return matched;
        }

        // should not touch sprintf directives
        // @source: https://stackoverflow.com/a/8915445/
        if (word.match(/%(?:\d+\$)?[+-]?(?:[ 0]|'.{1})?-?\d*(?:\.\d+)?[bcdeEufFgGosxX]/g)) {
          return matched;
        }

        // should not touch numbers in html entities
        if (word.match(/&#\d+;/g)) {
          return matched;
        }

        // skips converting english numbers of ordered lists in markdown
        if (opts.skip_markdown_ordered_lists_numbers_conversion &&
          (matched + trailings + after).match(/(?:(?:\r?\n)|(?:\r\n?)|(?:^|\n))\d+\.\s/)) {
          return matched;
        }

        if (opts.fix_english_numbers) {
          matched = fixEnglishNumbers(matched);
        }

        if (opts.fix_numeral_symbols) {
          matched = fixNumeralSymbols(matched);
        }

        if (opts.fix_punctuations) {
          matched = fixPunctuations(matched);
        }

        if (opts.fix_misc_non_persian_chars) {
          matched = fixMiscNonPersianChars(matched);
        }

        if (opts.fix_question_mark) {
          matched = fixQuestionMark(matched);
        }

        return matched;
      }
    );

    if (opts.normalize_dates) {
      text = normalizeDates(text);
    }

    if (opts.fix_perfix_spacing) {
      text = fixPerfixSpacing(text);
    }

    if (opts.fix_suffix_spacing) {
      text = fixSuffixSpacing(text);
    }

    if (opts.fix_suffix_misc) {
      text = fixSuffixMisc(text);
    }

    if (opts.fix_spacing_for_braces_and_quotes) {
      text = fixBracesSpacing(text);
    }

    if (opts.cleanup_extra_marks) {
      text = cleanupExtraMarks(text);
    }

    if (opts.fix_spacing_for_punctuations) {
      text = fixPunctuationSpacing(text);
    }

    if (opts.kashidas_as_parenthetic) {
      text = kashidasAsParenthetic(text);
    }

    if (opts.cleanup_kashidas) {
      text = cleanupKashidas(text);
    }

    if (opts.markdown_normalize_braces) {
      text = markdownNormalizeBraces(text);
    }

    if (opts.markdown_normalize_lists) {
      text = markdownNormalizeLists(text);
    }

    // doing it again after `fixPunctuationSpacing()`
    if (opts.fix_spacing_for_braces_and_quotes) {
      text = fixBracesSpacingInside(text);
    }

    if (opts.fix_misc_spacing) {
      text = fixMiscSpacing(text);
    }

    if (opts.remove_diacritics) {
      text = removeDiacritics(text);
    } else if (opts.fix_diacritics) {
      text = fixDiacritics(text);
    }

    if (opts.cleanup_spacing) {
      text = cleanupSpacing(text);
    }

    if (opts.cleanup_zwnj) {
      text = cleanupZWNJLate(text);
    }

    if (opts.cleanup_line_breaks) {
      text = cleanupLineBreaks(text);
    }

    // bringing back entities
    if (opts.preserve_entities) {
      text = restore(text, /[ ]?__ENTITIES__PRESERVER__[ ]?/g, preservedEntities);
    }

    // bringing back nbsps
    if (opts.preserve_nbsps) {
      text = restore(text, /[ ]?__NBSPS__PRESERVER__[ ]?/g, nbsps);
    }

    // bringing back URIs
    if (opts.preserve_URIs) {
      // no padding!
      text = restore(text, /__MD_LINK__PRESERVER__/g, mdlinks);
      text = restore(text, /[ ]?__URI__PRESERVER__[ ]?/g, uris);
    }

    // bringing back braces
    if (opts.preserve_braces) {
      text = restore(text, /[ ]?__BRACES__PRESERVER__[ ]?/g, braces);
    }

    // bringing back brackets
    if (opts.preserve_brackets) {
      text = restore(text, /[ ]?__BRACKETS__PRESERVER__[ ]?/g, brackets);
    }

    // bringing back HTML comments
    if (opts.preserve_comments) {
      text = restore(text, /[ ]?__COMMENT__PRESERVER__[ ]?/g, comments);
    }

    // bringing back HTML tags
    if (opts.preserve_HTML) {
      text = restore(text, /[ ]?__HTML__PRESERVER__[ ]?/g, html);
    }

    // bringing back frontmatter
    if (opts.preserve_frontmatter) {
      text = restore(text, /[ ]?__FRONTMATTER__PRESERVER__[ ]?/g, frontmatter);
    }

    if (opts.cleanup_begin_and_end) {
      text = cleanupBeginAndEnd(text);
    } else {
      // removes single space paddings around the string
      text = text.replace(/^[ ]/g, '').replace(/[ ]$/g, '');
    }

    return text;
  }

  // props @ebraminio/persiantools
  function cleanupZWNJ (text) {
    return text
      // converts all soft hyphens (&shy;) into zwnj
      .replace(/\u00ad/g, '\u200c')

      // removes more than one zwnj
      .replace(/\u200c{2,}/g, '\u200c')

      // cleans zwnj before and after numbers, english words, spaces and punctuations
      .replace(/\u200c([\w\s0-9۰-۹[\](){}«»“”.…,:;?!$%@#*=+\-/\\،؛٫٬×٪؟ـ])/g, '$1')
      .replace(/([\w\s0-9۰-۹[\](){}«»“”.…,:;?!$%@#*=+\-/\\،؛٫٬×٪؟ـ])\u200c/g, '$1')

      // removes unnecessary zwnj on start/end of each line
      .replace(/(^\u200c|\u200c$)/gm, '')
    ;
  }

  // late checks for zwnjs
  function cleanupZWNJLate (text) {
    return text
      // cleans zwnj after characters that don't conncet to the next
      .replace(/([إأةؤورزژاآدذ،؛,:«»\\/@#$٪×*()ـ\-=|])\u200c/g, '$1')
    ;
  }

  // converts a numeric code point to a string, or `null` when it is not a
  // valid Unicode code point; uses `fromCodePoint` so that characters above
  // U+FFFF are not truncated
  function codePointToString (code) {
    if (!(code >= 0 && code <= 0x10ffff)) {
      return null;
    }

    return typeof String.fromCodePoint === 'function'
      ? String.fromCodePoint(code)
      : String.fromCharCode(code);
  }

  // converts numeral and selected html character-sets into original characters
  // @props: @substack/node-ent
  function decodeHTMLEntities (text) {
    return text.replace(/&(#?[^;\W]+;?)/g, function (matched, match) {
      var n;
      var decoded;

      if ((n = /^#(\d+);?$/.exec(match))) {
        decoded = codePointToString(parseInt(n[1], 10));
        return decoded === null ? '&' + match : decoded;
      } else if ((n = /^#[Xx]([A-Fa-f0-9]+);?/.exec(match))) {
        decoded = codePointToString(parseInt(n[1], 16));
        return decoded === null ? '&' + match : decoded;
      } else {
        var hasSemi = /;$/.test(match);
        var withoutSemi = hasSemi ? match.replace(/;$/, '') : match;
        var target = entities[withoutSemi] || (hasSemi && entities[match]);

        if (typeof target === 'number') {
          return String.fromCharCode(target);
        } else if (typeof target === 'string') {
          return target;
        } else {
          // unknown entity: leave it untouched
          return '&' + match;
        }
      }
    });
  }

  function normalizeEOL (text) {
    return text
      // replaces windows end of lines with unix eol (`\n`)
      .replace(/(\r?\n)|(\r\n?)/g, '\n')
    ;
  }

  function fixDashes (text) {
    return text
      // replaces triple dash to mdash
      .replace(/-{3}/g, '—')

      // replaces double dash to ndash
      .replace(/-{2}/g, '–')
    ;
  }

  function fixThreeDots (text) {
    return text
      // removes spaces between dots
      .replace(/\.([ ]+)(?=[.])/g, '.')

      // replaces three dots with ellipsis character
      .replace(/[ \t]*\.{3,}/g, '…')
    ;
  }

  function normalizeEllipsis (text) {
    return text
      // replaces more than one ellipsis with one
      .replace(/(…){2,}/g, '…')

      // replaces (space|tab|zwnj) after ellipsis with one space
      // NOTE: allows for space before ellipsis
      .replace(/([ ]{1,})*…[ \t\u200c]*/g, '$1… ')
    ;
  }

  function fixEnglishQuotesPairs (text) {
    return text
      // replaces english quote pairs with their persian equivalent
      .replace(/(“)(.+?)(”)/g, '«$2»')
    ;
  }

  // replaces english quote marks with their persian equivalent
  function fixEnglishQuotes (text) {
    return text
      .replace(/(["'`]+)(.+?)(\1)/g, '«$2»')
    ;
  }

  function fixHamzeh (text) {
    var replacement = '$1هٔ$3';

    return text
      // replaces ه followed by (space|ZWNJ|lrm) follow by ی with هٔ
      .replace(/(\S)(ه[\s\u200c\u200e]+[یي])([\s\u200c\u200e])/g, replacement) // heh + ye

      // replaces ه followed by (space|ZWNJ|lrm|nothing) follow by ء with هٔ
      .replace(/(\S)(ه[\s\u200c\u200e]?\u0621)([\s\u200c\u200e])/g, replacement) // heh + standalone hamza

      // replaces هٓ or single-character ۀ with the standard هٔ
      // props @ebraminio/persiantools
      .replace(/(ۀ|هٓ)/g, 'هٔ')
    ;
  }

  function fixHamzehArabic (text) {
    return text
      // converts arabic hamzeh ة to هٔ
      .replace(/(\S)ة([\s\u200c\u200e])/g, '$1هٔ$2')
    ;
  }

  function fixHamzehArabicAlt (text) {
    return text
      // converts arabic hamzeh ة to هی
      .replace(/(\S)ة([\s\u200c\u200e])/g, '$1هی$2')
    ;
  }

  function cleanupRLM (text) {
    return text
      // converts Right-to-left marks followed by persian characters to
      // zero-width non-joiners (ZWNJ)
      .replace(/([^a-zA-Z\-_])(\u200F)/g, '$1\u200c')
    ;
  }

  // converts incorrect persian glyphs to standard characters
  function fixPersianGlyphs (text) {
    return arrReplace(text, glyphs);
  }

  // props @ebraminio/persiantools
  function fixMiscNonPersianChars (text) {
    // in order: arabic kaf, swash kaf, arabic yeh, alef maksura (urdu),
    // pushtu yeh, uyghur yeh, heh goal, kurdish ae, heh doachashmee (U+06BE)
    return charReplace(text, 'كڪيىۍېہە\u06be', 'ککییییههه');
  }

  // replaces english numbers with their persian equivalent
  function fixEnglishNumbers (text) {
    return charReplace(text, '1234567890', digits);
  }

  // replaces arabic numbers with their persian equivalent
  function fixArabicNumbers (text) {
    return charReplace(text, '١٢٣٤٥٦٧٨٩٠', digits);
  }

  // converts arabic/persian digits to their english equivalent
  // @REF: https://github.com/shkarimpour/pholiday/pull/5/files
  function convertPersianNumbers (text) {
    return text.replace(/[\u0660-\u0669\u06f0-\u06f9]/g, function (char) {
      // the low nibble of both digit ranges is the digit value
      return char.charCodeAt(0) & 0xf;
    });
  }

  function fixNumeralSymbols (text) {
    return text
      // replaces english percent signs (U+066A)
      // props @ebraminio/persiantools
      .replace(/([۰-۹]) ?%/g, '$1٪')

      // replaces dots between numbers into decimal separator (U+066B)
      // props @ebraminio/persiantools
      .replace(/([۰-۹])\.(?=[۰-۹])/g, '$1٫')

      // replaces commas between numbers into thousands separator (U+066C)
      // props @languagetool-org
      .replace(/([۰-۹]),(?=[۰-۹])/g, '$1٬')
    ;
  }

  function normalizeDates (text) {
    return text
      // re-orders date parts with slash as delimiter
      .replace(/([0-9۰-۹]{1,2})([/-])([0-9۰-۹]{1,2})\2([0-9۰-۹]{4})/g, function (matched, day, delimiter, month, year) {
        return year + '/' + month + '/' + day;
      })
    ;
  }

  function fixPunctuations (text) {
    return charReplace(text, ',;', '،؛');
  }

  // replaces question marks with its persian equivalent
  function fixQuestionMark (text) {
    return text
      .replace(/(\?)/g, '\u061F') // \u061F = ؟
    ;
  }

  // puts zwnj between the word and the prefix:
  // - mi* nemi* bi*
  // NOTE: there's a possible bug here: prefixes could be separate nouns
  function fixPerfixSpacing (text) {
    var replacement = '$1\u200c$3';

    return text
      .replace(/((\s|^)ن?می) ([^ ])/g, replacement)
      .replace(/((\s|^)بی) ([^ ])/g, replacement) // props @zoghal
    ;
  }

  // puts zwnj between the word and the suffix
  // NOTE: possible bug: suffixes could be nouns
  function fixSuffixSpacing (text) {
    var replacement = '$1\u200c$2';

    return text
      // must done before others
      // *ha *haye
      .replace(newRegExp('([' + charsPersian + charsDiacritic + ']) (ها(ی)?[' + patternAfter + '])'), replacement)

      // *am *at *ash *ei *eid *eem *and *man *tan *shan
      .replace(newRegExp('([' + charsPersian + charsDiacritic + ']) ((ام|ات|اش|ای|اید|ایم|اند|مان|تان|شان)[' + patternAfter + '])'), replacement)

      // *tar *tari *tarin
      .replace(newRegExp('([' + charsPersian + charsDiacritic + ']) (تر((ی)|(ین))?[' + patternAfter + '])'), replacement)

      // *hayee *hayam *hayat *hayash *hayetan *hayeman *hayeshan
      .replace(newRegExp('([' + charsPersian + charsDiacritic + ']) ((هایی|هایم|هایت|هایش|هایمان|هایتان|هایشان)[' + patternAfter + '])'), replacement)
    ;
  }

  function fixSuffixSpacingHamzeh (text) {
    var replacement = '$1\u0647\u200c\u06cc$3';

    return text
      // heh + ye
      .replace(/(\S)(ه[\s\u200c]+[یي])([\s\u200c])/g, replacement)

      // heh + standalone hamza
      .replace(/(\S)(ه[\s\u200c]?\u0621)([\s\u200c])/g, replacement)

      // heh + hamza above
      .replace(/(\S)(ه[\s\u200c]?\u0654)([\s\u200c])/g, replacement)
    ;
  }

  function fixSuffixMisc (text) {
    return text
      // replaces ه followed by ئ or ی, and then by ی, with ه\u200cای,
      // EXAMPLE: خانهئی becomes خانهای
      // props @ebraminio/persiantools
      .replace(/(\S)ه[\u200c\u200e][ئی]ی([\s\u200c\u200e])/g, '$1ه\u200cای$2')
    ;
  }

  function cleanupExtraMarks (text) {
    return text
      // removes space between different/same marks (combining for cleanup)
      .replace(/([؟?!])([ ]+)(?=[؟?!])/g, '$1')

      // replaces more than one exclamation mark with just one
      .replace(/(!){2,}/g, '$1')

      // replaces more than one english or persian question mark with just one
      .replace(/(\u061F|\?){2,}/g, '$1') // \u061F = `؟`

      // re-orders consecutive marks
      .replace(/(!)([ \t]*)([\u061F?])/g, '$3$1') // `?!` --> `!?`
    ;
  }

  // replaces kashidas to ndash in parenthetic
  function kashidasAsParenthetic (text) {
    return text
      .replace(/(\s)\u0640+/g, '$1–')
      .replace(/\u0640+(\s)/g, '–$1')
    ;
  }

  function cleanupKashidas (text) {
    return text
      // converts kashida between numbers to ndash
      .replace(/([0-9۰-۹]+)ـ+([0-9۰-۹]+)/g, '$1–$2')

      // removes all kashidas between non-whitespace characters
      // MAYBE: more punctuations
      .replace(/([^\s.])\u0640+(?![\s.])/g, '$1')
    ;
  }

  function fixPunctuationSpacing (text) {
    return text
      // removes space before punctuations
      .replace(/[ \t\u200c]*([:;,؛،.؟?!]{1})/g, '$1')

      // removes more than one space after punctuations
      // except followed by new-lines (or preservers)
      .replace(/([:;,؛،.؟?!]{1})[ \t\u200c]*(?!\n|_{2})/g, '$1 ')

      // removes space after colon that separates time parts
      .replace(/([0-9۰-۹]+):\s+([0-9۰-۹]+)/g, '$1:$2')

      // removes space after dots in numbers
      .replace(/([0-9۰-۹]+)\. ([0-9۰-۹]+)/g, '$1.$2')

      // removes space before common domain tlds
      .replace(/([\w\-_]+)\. (ir|com|org|net|info|edu|me)([\s/\\\])»:;.])/g, '$1.$2$3')

      // removes space between different/same marks (double-check)
      .replace(/([؟?!])([ ]+)(?=[؟?!])/g, '$1')
    ;
  }

  function fixBracesSpacing (text) {
    var replacement = ' $1$2$3 ';

    return text
      // removes inside spaces and more than one outside
      // for `()`, `[]`, `{}`, `“”` and `«»`
      .replace(/[ \t\u200c]*(\()\s*([^)]+?)\s*?(\))[ \t\u200c]*/g, replacement)
      .replace(/[ \t\u200c]*(\[)\s*([^\]]+?)\s*?(\])[ \t\u200c]*/g, replacement)
      .replace(/[ \t\u200c]*(\{)\s*([^}]+?)\s*?(\})[ \t\u200c]*/g, replacement)
      .replace(/[ \t\u200c]*(“)\s*([^”]+?)\s*?(”)[ \t\u200c]*/g, replacement)
      .replace(/[ \t\u200c]*(«)\s*([^»]+?)\s*?(»)[ \t\u200c]*/g, replacement)
    ;
  }

  function fixBracesSpacingInside (text) {
    var replacement = '$1$2$3';

    return text
      // removes inside spaces for `()`, `[]`, `{}`, `“”` and `«»`
      .replace(/(\()\s*([^)]+?)\s*?(\))/g, replacement)
      .replace(/(\[)\s*([^\]]+?)\s*?(\])/g, replacement)
      .replace(/(\{)\s*([^}]+?)\s*?(\})/g, replacement)
      .replace(/(“)\s*([^”]+?)\s*?(”)/g, replacement)
      .replace(/(«)\s*([^»]+?)\s*?(»)/g, replacement)

      // NOTE: must be here, wierd not working if on `markdownNormalizeBraces()`
      // removes markdown link spaces inside normal ()
      .replace(/(\(\[.*?\]\(.*?\))\s+(\))/g, '$1$2')
    ;
  }

  function markdownNormalizeBraces (text) {
    return text
      // removes space between ! and opening brace on markdown images
      // EXAMPLE: `! [alt] (src)` --> `![alt](src)`
      .replace(/! (\[.*?\])[ ]?(\(.*?\))[ ]?/g, '!$1$2')

      // removes spaces between [] and ()
      // EXAMPLE: `[text] (link)` --> `[text](link)`
      .replace(/(\[.*?\])[ \t]+(\(.*?\))/g, '$1$2')

      // removes spaces inside double () [] {}
      // EXAMPLE: `[[ text ]]` --> `[[text]]`
      .replace(/\(\([ \t]*(.*?)[ \t]*\)\)/g, '(($1))')
      .replace(/\[\[[ \t]*(.*?)[ \t]*\]\]/g, '[[$1]]')
      .replace(/\{\{[ \t]*(.*?)[ \t]*\}\}/g, '{{$1}}')
      .replace(/\{\{\{[ \t]*(.*?)[ \t]*\}\}\}/g, '{{{$1}}}') // mustache escape

      // removes spaces between double () [] {}
      // EXAMPLE: `[[text] ]` --> `[[text]]`
      .replace(/(\(\(.*\))[ \t]+(\))/g, '$1$2')
      .replace(/(\[\[.*\])[ \t]+(\])/g, '$1$2')
      .replace(/(\{\{.*\})[ \t]+(\})/g, '$1$2')
    ;
  }

  function markdownNormalizeLists (text) {
    return text
      // removes extra line between two items list
      .replace(/((\n|^)\*.*?)\n+(?=\n\*)/g, '$1')
      .replace(/((\n|^)-.*?)\n+(?=\n-)/g, '$1')
      .replace(/((\n|^)#.*?)\n+(?=\n#)/g, '$1')
    ;
  }

  function fixMiscSpacing (text) {
    return text
      // removes space before parentheses on misc cases
      .replace(/ \((ص|عج|س|ع|ره)\)/g, '($1)')

      // removes space before braces containing numbers
      .replace(/ \[([0-9۰-۹]+)\]/g, '[$1]')
    ;
  }

  function fixDiacritics (text) {
    return text
      // cleans zwnj before diacritic characters
      .replace(newRegExp('\u200c([' + charsDiacritic + '])'), '$1')

      // cleans more than one diacritic characters
      // props @languagetool-org
      .replace(newRegExp('(.*)([' + charsDiacritic + ']){2,}(.*)'), '$1$2$3')

      // cleans spaces before diacritic characters
      .replace(newRegExp('(\\S)[ ]+([' + charsDiacritic + '])'), '$1$2')
    ;
  }

  function removeDiacritics (text) {
    return text
      // removes all diacritic characters
      .replace(newRegExp('[' + charsDiacritic + ']+'), '')
    ;
  }

  function cleanupSpacing (text) {
    return text
      // replaces more than one space with just a single one
      // except before/after preservers and before new-lines
      // .replace(/(?<![_]{2})([ ]{2,})(?![_]{2}|\n)/g, ' ') // WORKS: using lookbehind
      .replace(/([^_])([ ]{2,})(?![_]{2}|\n)/g, '$1 ')

      // cleans tab/space/zwnj/zwj/nbsp between two new-lines(\n)
      // @REF: https://stackoverflow.com/a/10965543/
      .replace(/^\n([\t\u0020\u200c\u200d\u00a0]*)\n$/gm, '\n\n')
    ;
  }

  function cleanupLineBreaks (text) {
    return text
      // cleans more than two contiguous line-breaks
      .replace(/\n{2,}/g, '\n\n')
    ;
  }

  function cleanupBeginAndEnd (text) {
    return text
      // removes space/tab/zwnj/nbsp from the beginning of the new-lines
      .replace(/([\n]+)[ \t\u200c\u00a0]*/g, '$1')

      // removes spaces, tabs, zwnj, direction marks and new lines from
      // the beginning and end of text
      // @REF: http://stackoverflow.com/a/38490203
      .replace(/^[\s\u200c\u200e\u200f]+|[\s\u200c\u200e\u200f]+$/g, '')
    ;
  }

  // moves selected leading marks to the end of the text and a trailing dash
  // to the beginning
  function flipPunctuations (text) {
    var end = ['-'];
    var start = ['!', '.', '،', '…', '"'];
    var before = [];
    var after = [];

    text = fixThreeDots(text);

    for (var iStart = 0; iStart < start.length; iStart++) {
      var sElement = start[iStart];
      var sReg = newRegExp('^\\' + sElement, 'i');

      if (sReg.test(text)) {
        text = text.replace(sReg, '').trim();
        after.push(sElement);
      }
    }

    for (var iEnd = 0; iEnd < end.length; iEnd++) {
      var eElement = end[iEnd];
      var eReg = newRegExp('\\' + eElement + '$', 'i');

      if (eReg.test(text)) {
        text = text.replace(eReg, '').trim();
        before.push(eElement);
      }
    }

    for (var iBefore = 0; iBefore < before.length; iBefore++) {
      text = before[iBefore] + ' ' + text;
    }

    for (var iAfter = 0; iAfter < after.length; iAfter++) {
      text += after[iAfter];
    }

    return normalizeEllipsis(text);
  }

  // swap incorrect quotes pairs `»«` to `«»` and `”“` to `“”`
  function swapQuotes (text) {
    return text
      .replace(/(»)(.+?)(«)/g, '«$2»')
      .replace(/(”)(.+?)(“)/g, '“$2”')
    ;
  }

  // NOTE: the individual fixers are intentionally not exposed; they are
  // reachable only through `cleanup()` and its options.
  Virastar.prototype = {
    // public methods
    defaults: defaults,
    cleanup: cleanup,

    // extra methods
    convertPersianNumbers: convertPersianNumbers,
    flipPunctuations: flipPunctuations,
    swapQuotes: swapQuotes
  };

  return Virastar;
}));