00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
/**
 * Computes readability scores and basic text statistics for English text.
 *
 * Every public metric normalises its input with clean_text() first, so raw
 * text (optionally containing HTML) can be passed to any of them directly.
 * Word and sentence counts are never less than 1, so the score formulas
 * cannot divide by zero.
 */
class TextStatistics {

    /**
     * Character encoding handed to the mb_* functions.
     * An empty string means "let mbstring use its own default".
     */
    protected $strEncoding = '';

    /**
     * @param string $strEncoding Optional encoding name (e.g. 'UTF-8').
     *                            Ignored when empty.
     */
    public function __construct($strEncoding = '') {
        if ($strEncoding != '') {
            $this->strEncoding = $strEncoding;
        }
    }

    /**
     * Flesch-Kincaid Reading Ease:
     * 206.835 - 1.015 * (words / sentence) - 84.6 * (syllables / word).
     *
     * @param string $strText Text to score.
     * @return float Score rounded to one decimal place.
     */
    public function flesch_kincaid_reading_ease($strText) {
        $strText = $this->clean_text($strText);
        $fltWordsPerSentence = $this->average_words_per_sentence($strText);
        $fltSyllablesPerWord = $this->average_syllables_per_word($strText);
        return round(206.835 - (1.015 * $fltWordsPerSentence) - (84.6 * $fltSyllablesPerWord), 1);
    }

    /**
     * Flesch-Kincaid Grade Level:
     * 0.39 * (words / sentence) + 11.8 * (syllables / word) - 15.59.
     *
     * @param string $strText Text to score.
     * @return float Grade level rounded to one decimal place.
     */
    public function flesch_kincaid_grade_level($strText) {
        $strText = $this->clean_text($strText);
        $fltWordsPerSentence = $this->average_words_per_sentence($strText);
        $fltSyllablesPerWord = $this->average_syllables_per_word($strText);
        return round((0.39 * $fltWordsPerSentence) + (11.8 * $fltSyllablesPerWord) - 15.59, 1);
    }

    /**
     * Gunning-Fog score:
     * 0.4 * ((words / sentence) + percentage of words with 3+ syllables).
     * Words that start with a capital letter are not counted as complex.
     *
     * @param string $strText Text to score.
     * @return float Score rounded to one decimal place.
     */
    public function gunning_fog_score($strText) {
        $strText = $this->clean_text($strText);
        $fltWordsPerSentence = $this->average_words_per_sentence($strText);
        $fltComplexPercentage = $this->percentage_words_with_three_syllables($strText, false);
        return round(($fltWordsPerSentence + $fltComplexPercentage) * 0.4, 1);
    }

    /**
     * Coleman-Liau index: 0.0588 * L - 0.296 * S - 15.8, where L is the
     * number of letters per 100 words and S the number of sentences per
     * 100 words.
     *
     * @param string $strText Text to score.
     * @return float Index rounded to one decimal place.
     */
    public function coleman_liau_index($strText) {
        $strText = $this->clean_text($strText);
        $intWords = $this->word_count($strText);
        // Both inputs of the formula are rates per 100 words, not per word.
        $fltLettersPer100Words = 100 * $this->letter_count($strText) / $intWords;
        $fltSentencesPer100Words = 100 * $this->sentence_count($strText) / $intWords;
        return round((0.0588 * $fltLettersPer100Words) - (0.296 * $fltSentencesPer100Words) - 15.8, 1);
    }

    /**
     * SMOG index: 1.043 * sqrt(polysyllables * 30 / sentences) + 3.1291.
     *
     * @param string $strText Text to score.
     * @return float Index rounded to one decimal place.
     */
    public function smog_index($strText) {
        $strText = $this->clean_text($strText);
        $intPolysyllables = $this->words_with_three_syllables($strText);
        $intSentences = $this->sentence_count($strText);
        // The 3.1291 constant is added after the square root, not inside it.
        return round((1.043 * sqrt($intPolysyllables * (30 / $intSentences))) + 3.1291, 1);
    }

    /**
     * Automated Readability Index:
     * 4.71 * (letters / word) + 0.5 * (words / sentence) - 21.43.
     *
     * @param string $strText Text to score.
     * @return float Index rounded to one decimal place.
     */
    public function automated_readability_index($strText) {
        $strText = $this->clean_text($strText);
        $intWords = $this->word_count($strText);
        $fltLettersPerWord = $this->letter_count($strText) / $intWords;
        $fltWordsPerSentence = $intWords / $this->sentence_count($strText);
        return round((4.71 * $fltLettersPerWord) + (0.5 * $fltWordsPerSentence) - 21.43, 1);
    }

    /**
     * Length of the text in characters. Falls back to the byte length
     * (strlen) when the multibyte call is unavailable or fails.
     *
     * @param string $strText Text to measure; it is NOT cleaned first.
     * @return int
     */
    public function text_length($strText) {
        return $this->call_multibyte('mb_strlen', 'strlen', array($strText));
    }

    /**
     * Number of ASCII letters (A-Z, a-z) in the cleaned text.
     *
     * @param string $strText Text to measure.
     * @return int
     */
    public function letter_count($strText) {
        $strLettersOnly = preg_replace('/[^A-Za-z]+/', '', $this->clean_text($strText));
        return $this->text_length($strLettersOnly);
    }

    /**
     * Normalises text so that words are separated by single spaces and every
     * sentence ends in a single full stop followed by a space.
     *
     * @param string $strText Raw text, optionally containing HTML.
     * @return string Cleaned text, always ending in '.'.
     */
    protected function clean_text($strText) {
        // Closing tags of block-level elements act as sentence terminators.
        $arrFullStopTags = array('li', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'dd');
        foreach ($arrFullStopTags as $strTag) {
            $strText = str_ireplace('</' . $strTag . '>', '.', $strText);
        }
        $strText = strip_tags($strText);
        // Minor punctuation becomes whitespace; every terminator becomes '.'.
        $strText = preg_replace('/[,:;()-]/', ' ', $strText);
        $strText = preg_replace('/[\.!?]/', '.', $strText);
        // Guarantee a terminator on the final sentence.
        $strText = trim($strText) . '.';
        // Line breaks become single spaces.
        $strText = preg_replace('/[ ]*(\n|\r\n|\r)[ ]*/', ' ', $strText);
        // Collapse runs of terminators, then re-space them as ". ".
        $strText = preg_replace('/([\.])[\. ]+/', '$1', $strText);
        $strText = trim(preg_replace('/[ ]*([\.])/', '$1 ', $strText));
        $strText = preg_replace('/[ ]+/', ' ', $strText);
        // Lower-case the first word after each sentence break so it is not
        // mistaken for a proper noun. The very first word of the text is
        // left untouched. A closure is used because create_function() no
        // longer exists in PHP 8.
        $strText = preg_replace_callback(
            '/\. [^ ]+/',
            function ($arrMatches) {
                return strtolower($arrMatches[0]);
            },
            $strText
        );
        return $strText;
    }

    /**
     * Lower-cases text, multibyte-aware when possible.
     *
     * @param string $strText
     * @return string
     */
    protected function lower_case($strText) {
        return $this->call_multibyte('mb_strtolower', 'strtolower', array($strText));
    }

    /**
     * Upper-cases text, multibyte-aware when possible.
     *
     * @param string $strText
     * @return string
     */
    protected function upper_case($strText) {
        return $this->call_multibyte('mb_strtoupper', 'strtoupper', array($strText));
    }

    /**
     * Extracts part of a string, multibyte-aware when possible.
     *
     * @param string $strText
     * @param int    $intStart  Zero-based start offset.
     * @param int    $intLength Maximum number of characters to return.
     * @return string
     */
    protected function substring($strText, $intStart, $intLength) {
        return $this->call_multibyte('mb_substr', 'substr', array($strText, $intStart, $intLength));
    }

    /**
     * Runs a multibyte string function, appending the configured encoding
     * as its last argument when one is set, and falls back to the byte-wise
     * equivalent when the multibyte call cannot be used.
     *
     * The fallback is taken when the mbstring function does not exist, when
     * it returns false (how PHP before 8.0 reports an unknown encoding), or
     * when it throws ValueError (how PHP 8.0+ reports it).
     *
     * @param string $strMultibyteFunction Name of the mb_* function.
     * @param string $strByteFunction      Name of the byte-wise fallback.
     * @param array  $arrArguments         Arguments shared by both functions.
     * @return mixed Result of whichever function ran.
     */
    private function call_multibyte($strMultibyteFunction, $strByteFunction, array $arrArguments) {
        if (function_exists($strMultibyteFunction)) {
            $arrMultibyteArguments = $arrArguments;
            if ($this->strEncoding != '') {
                $arrMultibyteArguments[] = $this->strEncoding;
            }
            try {
                $mixResult = call_user_func_array($strMultibyteFunction, $arrMultibyteArguments);
                if ($mixResult !== false) {
                    return $mixResult;
                }
            } catch (ValueError $objError) {
                // Unknown encoding on PHP 8+: use the byte-wise function below.
            } catch (Exception $objException) {
                // Best effort, as before: use the byte-wise function below.
            }
        }
        return call_user_func_array($strByteFunction, $arrArguments);
    }

    /**
     * Number of sentences, i.e. terminators left after cleaning.
     *
     * @param string $strText
     * @return int Always at least 1.
     */
    public function sentence_count($strText) {
        $strTerminators = preg_replace('/[^\.!?]/', '', $this->clean_text($strText));
        return max(1, $this->text_length($strTerminators));
    }

    /**
     * Number of words, i.e. spaces in the cleaned text plus one.
     *
     * @param string $strText
     * @return int Always at least 1, even for empty input.
     */
    public function word_count($strText) {
        $strSpaces = preg_replace('/[^ ]/', '', $this->clean_text($strText));
        return 1 + $this->text_length($strSpaces);
    }

    /**
     * Average number of words per sentence.
     *
     * @param string $strText
     * @return int|float
     */
    public function average_words_per_sentence($strText) {
        $strText = $this->clean_text($strText);
        return $this->word_count($strText) / $this->sentence_count($strText);
    }

    /**
     * Average number of syllables per word.
     *
     * @param string $strText
     * @return int|float
     */
    public function average_syllables_per_word($strText) {
        $strText = $this->clean_text($strText);
        $intSyllableCount = 0;
        foreach (explode(' ', $strText) as $strWord) {
            $intSyllableCount += $this->syllable_count($strWord);
        }
        return $intSyllableCount / $this->word_count($strText);
    }

    /**
     * Number of words with three or more syllables.
     *
     * @param string $strText
     * @param bool   $blnCountProperNouns When false, a word is skipped if its
     *                                    first character is unchanged by
     *                                    upper-casing (capitalised words, and
     *                                    also words starting with a digit).
     * @return int
     */
    public function words_with_three_syllables($strText, $blnCountProperNouns = true) {
        $strText = $this->clean_text($strText);
        $intLongWordCount = 0;
        foreach (explode(' ', $strText) as $strWord) {
            if ($this->syllable_count($strWord) <= 2) {
                continue;
            }
            if ($blnCountProperNouns) {
                $intLongWordCount++;
                continue;
            }
            $strFirstLetter = $this->substring($strWord, 0, 1);
            if ($strFirstLetter !== $this->upper_case($strFirstLetter)) {
                $intLongWordCount++;
            }
        }
        return $intLongWordCount;
    }

    /**
     * Percentage (0-100) of words with three or more syllables.
     *
     * @param string $strText
     * @param bool   $blnCountProperNouns See words_with_three_syllables().
     * @return int|float
     */
    public function percentage_words_with_three_syllables($strText, $blnCountProperNouns = true) {
        $strText = $this->clean_text($strText);
        $intLongWordCount = $this->words_with_three_syllables($strText, $blnCountProperNouns);
        return ($intLongWordCount / $this->word_count($strText)) * 100;
    }

    /**
     * Estimates the number of syllables in a single English word.
     *
     * The estimate is the number of vowel groups, plus one per stripped
     * prefix/suffix, adjusted by pattern lists for letter combinations that
     * usually hide or add a syllable.
     *
     * @param string $strWord One word; surrounding punctuation is ignored.
     * @return int Always at least 1.
     */
    public function syllable_count($strWord) {
        // Normalise first: tokens taken from sentences arrive with trailing
        // punctuation ("forever."), which would otherwise defeat both the
        // exception list and the '$'-anchored patterns below.
        $strWord = $this->lower_case($strWord);
        $strWord = preg_replace('/[^a-z]/is', '', $strWord);

        // Words the heuristics get wrong, with their real syllable counts.
        $arrProblemWords = array(
            'simile' => 3,
            'forever' => 3,
            'shoreline' => 2,
        );
        if (isset($arrProblemWords[$strWord])) {
            return $arrProblemWords[$strWord];
        }

        // Each match of these patterns removes one syllable from the count.
        $arrSubSyllables = array(
            'cial',
            'tia',
            'cius',
            'cious',
            'giu',
            'ion',
            'iou',
            'sia$',
            '[^aeiuoyt]{2,}ed$',
            '.ely$',
            '[cg]h?e[rsd]?$',
            'rved?$',
            '[aeiouy][dt]es?$',
            '[aeiouy][^aeiouydt]e[rsd]?$',
            '^[dr]e[aeiou][^aeiou]+$',
            '[aeiouy]rse$',
        );

        // Each match of these patterns adds one syllable to the count.
        $arrAddSyllables = array(
            'ia',
            'riet',
            'dien',
            'iu',
            'io',
            'ii',
            '[aeiouym]bl$',
            '[aeiou]{3}',
            '^mc',
            'ism$',
            '([^aeiouy])\1l$',
            '[^l]lien',
            '^coa[dglx].',
            '[^gq]ua[^auieo]',
            'dnt$',
            'uity$',
            'ie(r|st)$',
        );

        // Affixes that each count as one syllable; they are removed so they
        // do not interfere with the vowel-group count.
        $arrPrefixSuffix = array(
            '/^un/',
            '/^fore/',
            '/ly$/',
            '/less$/',
            '/ful$/',
            '/ers?$/',
            '/ings?$/',
        );
        $intPrefixSuffixCount = 0;
        $strWord = preg_replace($arrPrefixSuffix, '', $strWord, -1, $intPrefixSuffixCount);

        // Every run of consecutive vowels is one candidate syllable.
        $arrVowelGroups = preg_split('/[^aeiouy]+/', $strWord, -1, PREG_SPLIT_NO_EMPTY);

        $intSyllableCount = count($arrVowelGroups) + $intPrefixSuffixCount;
        foreach ($arrSubSyllables as $strSyllable) {
            $intSyllableCount -= preg_match('~' . $strSyllable . '~', $strWord);
        }
        foreach ($arrAddSyllables as $strSyllable) {
            $intSyllableCount += preg_match('~' . $strSyllable . '~', $strWord);
        }

        // A word never has fewer than one syllable.
        return max(1, $intSyllableCount);
    }

}
00412
00413 ?>