fields_to_strip = array(); $this->check_all_fields = false; $this->count = 0; $this->cur_data = ""; $this->symbols = array(); $this->substitutes = array(); $this->pattern = ""; $this->alpha_symbols = array(); $this->alpha_substitutes = array(); $this->alpha_first_chars = array(); $this->alpha_pattern = ""; $this->sort_fields = array(); $this->has_sort_fields = false; $this->alpha = ""; $this->cur_alpha_index = 10; $this->strippable_fields = array(); $this->fields = array(); $this->fields_to_strip = array(); $this->in_entry = false; }

/**
 * Registers the element names whose text may supply an entry's sort key.
 *
 * Names are appended to $this->sort_fields in the order given. endHandler()
 * keeps the value of the sort field with the lowest position that occurs
 * in an entry.
 *
 * @param string $sort_fields Element names separated by single spaces.
 */
function get_sort_fields($sort_fields)
{
    foreach (explode(" ", $sort_fields) as $field) {
        // Doubled or trailing spaces produce empty tokens; an empty name can
        // never equal an element name, so it is dropped.
        if ($field !== "") {
            $this->sort_fields[] = $field;
        }
    }
}

/**
 * Reads a list of field names from a text file, one name per line.
 *
 * The list is returned to the caller; $this->fields_to_strip is not touched.
 *
 * @param string $fields Path of the file to read.
 * @return array Trimmed lines in file order; empty when the file is empty
 *               or cannot be opened.
 */
function get_fields_to_strip($fields)
{
    $fields_to_strip = array();
    $in = fopen($fields, "r");
    if ($in === false) {
        return $fields_to_strip;
    }
    // Compare against false explicitly so that a final line "0" is not
    // mistaken for end of file.
    while (($line = fgets($in)) !== false) {
        $fields_to_strip[] = trim($line);
    }
    fclose($in);
    return $fields_to_strip;
}

/**
 * Parses a substitution table supplied as text.
 *
 * Each line has the form "SYMBOLS = SUBSTITUTE". Every single byte of
 * SYMBOLS (after conversion with utf8_decode) is mapped to SUBSTITUTE;
 * a SUBSTITUTE written as '' stands for the empty string.
 *
 * NOTE(review): utf8_decode() is deprecated as of PHP 8.2. The tables end up
 * as single ISO-8859-1 bytes; whether the parsed text reaches strip() in the
 * same encoding depends on the XML parser's target encoding - confirm.
 *
 * @param string $textarea Table text, one mapping per line.
 * @return array List of array(symbol, substitute) pairs in input order.
 */
function _parse_substitutions($textarea)
{
    $textarea = trim(utf8_decode(stripslashes($textarea)));
    $pairs = array();
    foreach (explode("\n", $textarea) as $line) {
        $vals = explode(" = ", $line);
        if (count($vals) < 2) {
            // No " = " separator: there is no substitute to read, so the
            // line is ignored instead of indexing a missing element.
            continue;
        }
        $substitute = trim($vals[1]);
        if ($substitute === "''") {
            $substitute = "";
        }
        foreach (preg_split('//', $vals[0], -1, PREG_SPLIT_NO_EMPTY) as $symbol) {
            $pairs[] = array($symbol, $substitute);
        }
    }
    return $pairs;
}

/**
 * Prepares a run of characters for use inside a bracket expression.
 *
 * A trailing apostrophe is moved to the front (presumably so that a hyphen
 * listed just before it ends up last and is taken literally - confirm).
 *
 * @param string $chars Characters to be placed between "[" and "]".
 * @return string The characters, possibly reordered.
 */
function _order_for_brackets($chars)
{
    if (substr($chars, -1, 1) == "'") {
        $chars = "'" . substr($chars, 0, -1);
    }
    return $chars;
}

/**
 * Returns $pattern extended with an alternative that matches any key of
 * $first_chars at the start of the subject.
 *
 * @param string $pattern     Pattern built so far (may be empty).
 * @param array  $first_chars Table keyed by leading character.
 * @return string The extended pattern, or $pattern unchanged when the table
 *                contributes no characters.
 */
function _with_first_chars($pattern, $first_chars)
{
    $chars = $this->_order_for_brackets(trim(implode("", array_keys($first_chars))));
    if ($chars === "") {
        return $pattern;
    }
    if ($pattern != "") {
        $pattern .= "|";
    }
    return $pattern . "^[" . $chars . "]";
}

/**
 * Adds mappings to the sort-key symbol table ($this->alpha_symbols /
 * $this->alpha_substitutes) and rebuilds $this->alpha_pattern from it.
 *
 * @param string $textarea Table text, see _parse_substitutions().
 */
function make_alpha_symbols_table($textarea)
{
    foreach ($this->_parse_substitutions($textarea) as $pair) {
        $this->alpha_symbols[] = $pair[0];
        $this->alpha_substitutes[] = $pair[1];
    }
    $chars = $this->_order_for_brackets(implode("", $this->alpha_symbols));
    if ($chars !== "") {
        $this->alpha_pattern = "[" . $chars . "]";
    }
}

/**
 * Adds mappings to the sort-key leading-character table
 * ($this->alpha_first_chars) and appends a matching "^[...]" alternative to
 * $this->alpha_pattern.
 *
 * @param string $textarea Table text, see _parse_substitutions().
 */
function make_alpha_first_chars_table($textarea)
{
    foreach ($this->_parse_substitutions($textarea) as $pair) {
        $this->alpha_first_chars[$pair[0]] = $pair[1];
    }
    $this->alpha_pattern = $this->_with_first_chars($this->alpha_pattern, $this->alpha_first_chars);
}

/**
 * Adds mappings to the symbol table ($this->symbols / $this->substitutes)
 * and rebuilds $this->pattern from it.
 *
 * @param string $textarea Table text, see _parse_substitutions().
 */
function make_symbols_table($textarea)
{
    foreach ($this->_parse_substitutions($textarea) as $pair) {
        $this->symbols[] = $pair[0];
        $this->substitutes[] = $pair[1];
    }
    $chars = $this->_order_for_brackets(implode("", $this->symbols));
    if ($chars !== "") {
        $this->pattern = "[" . $chars . "]";
    }
}

/**
 * Adds mappings to the leading-character table ($this->first_chars) and
 * appends a matching "^[...]" alternative to $this->pattern.
 *
 * @param string $textarea Table text, see _parse_substitutions().
 */
function make_first_chars_table($textarea)
{
    // The visible part of the constructor does not initialise this table.
    if (!isset($this->first_chars) || !is_array($this->first_chars)) {
        $this->first_chars = array();
    }
    foreach ($this->_parse_substitutions($textarea) as $pair) {
        $this->first_chars[$pair[0]] = $pair[1];
    }
    $this->pattern = $this->_with_first_chars($this->pattern, $this->first_chars);
}

/**
 * Applies a symbol table everywhere in $value, then rewrites the leading
 * character for as long as it is a key of $first_chars.
 *
 * The loop does not terminate if a substitute in $first_chars begins with a
 * character that leads back to itself (e.g. "a = a"); callers must not
 * supply such a table.
 *
 * @param string $value       Text to transform.
 * @param array  $symbols     Characters to replace anywhere in the text.
 * @param array  $substitutes Replacement for the symbol at the same index.
 * @param mixed  $first_chars Map of leading character => replacement; any
 *                            non-array value means "no such table".
 * @return string The transformed text.
 */
function _apply_tables($value, $symbols, $substitutes, $first_chars)
{
    $stripped = str_replace($symbols, $substitutes, $value);
    if (!is_array($first_chars)) {
        return $stripped;
    }
    // Testing for the empty string first keeps substr() from handing a
    // non-string key to array_key_exists(), which the old code hid with @.
    while ($stripped !== "" && array_key_exists(substr($stripped, 0, 1), $first_chars)) {
        $stripped = $first_chars[substr($stripped, 0, 1)] . substr($stripped, 1);
    }
    return $stripped;
}

/**
 * Builds a sort key from $value using the alpha_* tables.
 *
 * @param string $value Field text.
 * @return string Text with the sort-key substitutions applied.
 */
function alpha_strip($value)
{
    return $this->_apply_tables($value, $this->alpha_symbols, $this->alpha_substitutes, $this->alpha_first_chars);
}

/**
 * Builds the stripped form of $value using the symbol and leading-character
 * tables.
 *
 * @param string $value Field text.
 * @return string Text with the substitutions applied.
 */
function strip($value)
{
    // first_chars only exists once make_first_chars_table() has run.
    $first_chars = isset($this->first_chars) ? $this->first_chars : null;
    return $this->_apply_tables($value, $this->symbols, $this->substitutes, $first_chars);
}

/**
 * XML parser callback for an opening tag.
 *
 * Marks the start of an entry when the head tag is seen. While a stripped
 * copy is being produced, the opening tag is echoed to $this->out: always
 * (followed by a newline) outside an entry, and only for elements listed in
 * $this->fields inside one. Attributes are not written.
 *
 * @param mixed  $xp      Parser handle (unused).
 * @param string $element Element name; compared in lower case.
 * @param array  $attribs Attributes (unused).
 */
function startHandler($xp, $element, $attribs)
{
    $element = strtolower($element);
    if ($element == $this->head_tag) {
        $this->in_entry = true;
    }
    if (!$this->produce_stripped_version) {
        return;
    }
    if (!$this->in_entry) {
        fwrite($this->out, "<$element>\n");
    } else if (in_array($element, $this->fields)) {
        fwrite($this->out, "<$element>");
    }
}

/**
 * Tests $data against a pattern produced by the make_*_table() methods.
 *
 * ereg() no longer exists as of PHP 7.0, so the pattern is run through
 * preg_match(). The patterns consist only of bracket expressions, in which
 * POSIX treats a backslash literally; backslashes are therefore doubled,
 * and the "/" delimiter is escaped.
 *
 * @param string $pattern Value of $this->pattern.
 * @param string $data    Text to test.
 * @return bool True when the pattern matches somewhere in $data.
 */
function _pattern_matches($pattern, $data)
{
    $regex = "/" . str_replace(array("\\", "/"), array("\\\\", "\\/"), $pattern) . "/";
    return preg_match($regex, $data) === 1;
}

/**
 * XML parser callback for a closing tag.
 *
 * In collection mode (produce_stripped_version false) it records which
 * elements occur inside entries ($this->fields) and which of them contain
 * text matched by $this->pattern ($this->strippable_fields).
 *
 * In output mode it writes the accumulated text and closing tag of the
 * element, an "<element>_s" companion holding the stripped text for elements
 * listed in $this->fields_to_strip, and - at the end of each entry - the
 * sort key.
 *
 * @param mixed  $xp      Parser handle (unused).
 * @param string $element Element name; compared in lower case.
 */
function endHandler($xp, $element)
{
    $element = strtolower($element);
    $is_head = ($element == $this->head_tag);

    if ($this->produce_stripped_version) {
        if ($this->has_sort_fields) {
            // A sort field earlier in the list overrides a key taken from a
            // later one within the same entry.
            $index = array_search($element, $this->sort_fields);
            if ($index !== false && $index < $this->cur_alpha_index) {
                $this->alpha = $this->alpha_strip($this->cur_data);
                $this->cur_alpha_index = $index;
            }
            if ($is_head) {
                // The key is wrapped in the "alpha" element that modify()
                // registers as an output field; as bare text it would become
                // anonymous content of the entry.
                fwrite($this->out, "<alpha>" . $this->alpha . "</alpha>\n");
                $this->alpha = "";
                $this->cur_alpha_index = count($this->sort_fields);
            }
        }
        // Mirror startHandler(): whatever was opened there is closed here,
        // otherwise the output is not well-formed XML.
        if (!$this->in_entry || in_array($element, $this->fields)) {
            fwrite($this->out, $this->cur_data . "</" . $element . ">\n");
        }
    } else if ($this->in_entry && !in_array($element, $this->fields)) {
        $this->fields[] = $element;
    }

    if ($this->pattern != "" && $this->_pattern_matches($this->pattern, $this->cur_data)) {
        if ($this->produce_stripped_version) {
            if (in_array($element, $this->fields_to_strip)) {
                $stripped_value = $this->strip($this->cur_data);
                fwrite($this->out, "<" . $element . "_s>" . $stripped_value . "</" . $element . "_s>\n");
            }
        } else if (!in_array($element, $this->strippable_fields)) {
            $this->strippable_fields[] = $element;
        }
    }

    if ($is_head) {
        $this->in_entry = false;
    }
    // Text is collected per element; start afresh for the next one.
    $this->cur_data = "";
}
/**
 * XML parser callback for character data.
 *
 * Non-blank chunks are trimmed, XML-escaped and appended to
 * $this->cur_data. The parser delivers text with entities already decoded,
 * and endHandler() writes cur_data into the output file, so "&", "<" and ">"
 * have to be escaped again. The previous ereg_replace() calls (a function
 * removed in PHP 7.0) had identical search and replacement strings and
 * changed nothing.
 *
 * NOTE(review): every chunk is trimmed separately, so whitespace next to a
 * chunk boundary inside one text node is lost - confirm this is intended.
 *
 * @param mixed  $xp   Parser handle (unused).
 * @param string $data One chunk of character data.
 */
function cDataHandler($xp, $data)
{
    $data = trim($data);
    if ($data !== "") {
        // "&" must be handled first so the ampersands introduced by the
        // other two replacements are not escaped a second time.
        $this->cur_data .= str_replace(
            array("&", "<", ">"),
            array("&amp;", "&lt;", "&gt;"),
            $data
        );
    }
}

/**
 * preg_replace_callback() helper for unhtmlentities().
 *
 * @param array $match Match data; element 1 holds the decimal digits.
 * @return string Result of chr() for that number.
 */
function _entity_code_to_char($match)
{
    return chr((int) $match[1]);
}

/**
 * Replaces named HTML entities (per get_html_translation_table) and decimal
 * numeric references ("&#NNN;") in $string by the characters they stand for.
 *
 * The numeric part previously relied on the /e modifier of preg_replace(),
 * which PHP 7 no longer supports; a callback does the same job.
 *
 * @param string $string Text containing entities.
 * @return string Text with entities replaced.
 */
function unhtmlentities($string)
{
    $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
    $ret = strtr($string, $trans_tbl);
    return preg_replace_callback('/&#([0-9]+);/', array($this, '_entity_code_to_char'), $ret);
}

/**
 * Creates an XML parser whose element and character-data events are routed
 * to startHandler(), endHandler() and cDataHandler() of this object.
 *
 * @return mixed The new parser handle.
 */
function _create_parser()
{
    $parser = xml_parser_create();
    // Callable arrays bind the handlers to this object directly, which makes
    // a separate xml_set_object() call unnecessary.
    xml_set_element_handler($parser, array($this, "startHandler"), array($this, "endHandler"));
    xml_set_character_data_handler($parser, array($this, "cDataHandler"));
    return $parser;
}

/**
 * Feeds a file line by line to $this->parser and finishes the parse.
 *
 * A parse error is reported as an E_USER_WARNING; it does not abort the
 * script.
 *
 * @param string $path Path of the XML file.
 * @return bool True when the whole file was parsed without error.
 */
function _parse_file($path)
{
    $in = fopen($path, "r");
    if ($in === false) {
        return false;
    }
    $ok = true;
    // Explicit comparison with false: a last line consisting of "0" must
    // still be parsed.
    while ($ok && ($line = fgets($in)) !== false) {
        $ok = (xml_parse($this->parser, $line, false) == 1);
    }
    fclose($in);
    if ($ok) {
        // Tell the parser that the input is complete so that it can flush
        // pending data and detect a truncated document.
        $ok = (xml_parse($this->parser, "", true) == 1);
    }
    if (!$ok) {
        trigger_error(
            sprintf(
                "XML error in %s at line %d: %s",
                $path,
                xml_get_current_line_number($this->parser),
                xml_error_string(xml_get_error_code($this->parser))
            ),
            E_USER_WARNING
        );
    }
    return $ok;
}

/**
 * First pass: scans an XML file and collects, sorted, the element names
 * found inside entries ($this->fields) and those whose text matches the
 * symbol / leading-character pattern ($this->strippable_fields).
 *
 * @param string $xml         Path of the XML file to scan.
 * @param string $head_tag    Element name that delimits one entry
 *                            (handlers compare it with lower-cased names).
 * @param string $symbols     Symbol table text; blank for none.
 * @param string $first_chars Leading-character table text; blank for none.
 */
function get_fields($xml, $head_tag, $symbols, $first_chars)
{
    $this->parser = $this->_create_parser();
    $this->head_tag = $head_tag;
    if (trim($symbols) != "") {
        $this->make_symbols_table($symbols);
    }
    if (trim($first_chars) != "") {
        $this->make_first_chars_table($first_chars);
    }
    $this->produce_stripped_version = false;

    $this->_parse_file($xml);
    // Release the parser, as modify() does.
    xml_parser_free($this->parser);

    sort($this->strippable_fields);
    sort($this->fields);
}

/**
 * Second pass: writes a copy of an XML file that keeps only the requested
 * fields of each entry, adds stripped "<field>_s" companions and, when sort
 * fields are given, a sort key per entry.
 *
 * @param string $xml               Path of the XML file to read.
 * @param string $new_xml_name      Path of the file to write (truncated).
 * @param string $head_tag          Element name that delimits one entry.
 * @param string $sort_fields       Space-separated sort field names, or ""
 *                                  to produce no sort key.
 * @param string $alpha_symbols     Symbol table text for the sort key.
 * @param string $alpha_first_chars Leading-character table text for the
 *                                  sort key.
 * @param string $symbols           Symbol table text for stripped fields.
 * @param string $first_chars       Leading-character table text for
 *                                  stripped fields.
 * @param array  $fields            Element names to keep inside entries.
 * @param array  $fields_to_strip   Element names that get a stripped
 *                                  companion.
 */
function modify($xml, $new_xml_name, $head_tag, $sort_fields, $alpha_symbols, $alpha_first_chars, $symbols, $first_chars, $fields, $fields_to_strip)
{
    $this->produce_stripped_version = true;
    $this->parser = $this->_create_parser();
    $this->fields = $fields;
    $this->fields_to_strip = $fields_to_strip;
    $this->head_tag = $head_tag;

    if ($sort_fields != "") {
        $this->get_sort_fields($sort_fields);
        $this->has_sort_fields = true;
        $this->fields[] = "alpha";
        // Start from the same "no key chosen yet" value endHandler() resets
        // to after each entry; the constructor's fixed 10 would ignore sort
        // fields beyond the tenth in the first entry.
        $this->cur_alpha_index = count($this->sort_fields);
    } else {
        $this->has_sort_fields = false;
    }

    if (trim($symbols) != "") {
        $this->make_symbols_table($symbols);
    }
    if (trim($first_chars) != "") {
        $this->make_first_chars_table($first_chars);
    }
    if (trim($alpha_symbols) != "") {
        $this->make_alpha_symbols_table($alpha_symbols);
    }
    if (trim($alpha_first_chars) != "") {
        $this->make_alpha_first_chars_table($alpha_first_chars);
    }

    $this->out = fopen($new_xml_name, "w");
    if ($this->out === false) {
        // Without an output stream the handlers have nothing to write to.
        xml_parser_free($this->parser);
        return;
    }
    fwrite($this->out, "\n");

    $this->_parse_file($xml);

    fclose($this->out);
    xml_parser_free($this->parser);
}
}

/*
// NOTE: you must remove utf8_decode from the make_*_table functions when
// testing from the command line
set_time_limit(0);
$converter = new db_modifier();
$sort_fields = "lxam lxoa";
$symbols = "á = a\né = e\ní = i\nó = o\nú = u\nÁ = A\nÉ = E\nÍ = I\nÓ = O\nÚ = U\nÑ = N\nñ = n\nü = u";
$first_chars = "-' = ''";
$alpha_symbols = "á = a";
$alpha_first_chars = "-' = ''";
$fields_not_to_strip = array();
$converter->get_fields("test.xml","refgroup",$symbols,$first_chars);
print_r($converter->fields);
$fields_to_strip = array_diff($converter->strippable_fields,$fields_not_to_strip);
$converter->modify("test.xml","with_stripped.xml","refgroup",
    $sort_fields,$alpha_symbols,$alpha_first_chars,$symbols,$first_chars,
    $converter->fields,$fields_to_strip);
*/
?>