A user first queries the database for the entries intended to be modified. These entries now have * an associated ID used by JLex for identifying rows in the database.

The entries are modified.

The user updates the database, in mode UPDATE_DATABASE, and supplies the modified xml. Update updating, * when JLex sees that an entry contains an ID, it knows to delete those entries in the database with that * id. The new entry is then entered into the database but keeps the same id.

* * @var boolean */ public $delete_entry; /** * An array containing all IDs created during the current update procedure. * @var array */ public $head_tag_ids; /** * This is a 2-dimensional array indexed by table name and containing rows for each table. * Rather than writing a row to the database after it is extracted from the XML document, * It is stored in this results array. Once the btye size of a set of rows reaches the value set in $max_file_size * the rows for a particular table are written to disk in the CSV file which will be bulk uploaded to MySQL * once all the data is extracted. * @var array */ public $results; /** * This tracks the number of entries extracted. * @var integer */ public $count; /** * An array indexed by table name containing the current size in bytes of all the rows for each table. This gets reset * to 0 after a set of results is written to disk. * @var array */ public $sql_file_sizes; /** * The maximum number of bytes that will be held in memory until the rows for a particular table are written to disk. * @var int. */ public $max_file_size = 10000000; /** * An array containing the number of bytes of the largest field in the XML document being uploaded. This is used for * defining the type of column associated with each field in a MySQL table. Another class, field_size_counter, * searches an XML document for the values inputted into this array. */ public $field_sizes; /**#@-*/ function __construct() { $this->head_tag = ""; $this->cur_group = null; $this->cur_tag = ""; $this->cur_data = ""; $this->template = ""; $this->get_entry = false; $this->delete_entry = false; $this->head_tag_ids = array(); $this->results = array(); $this->count = 0; $this->sql_file_size = array(); $this->field_sizes = array(); } /** * A function to remove html encodings of special characters. */ function unhtmlentities ($string) { $trans_tbl = get_html_translation_table (HTML_ENTITIES); $trans_tbl = array_flip ($trans_tbl); $ret = strtr ($string, $trans_tbl); return preg_replace('/\&\#([0-9]+)\;/me', "chr('\\1')",$ret); } /** * get_table_names returns an array containing all the tables in MySQL belonging to the specified project. * * A table is created for each group and subgroup of a particular project. The table name is designated by * the project_group. For example, each table for the nahuatl project will be nahuatl_groupname. The group class * provides a function which returns all the group names associated with a particular schema. get_table_names * calls this function to retrieve the group names. It then appends the project name to each group name and returns * an array containing these table names. * * @param string $project The project for which the list of tables is requested. * @param group $group The $group object representing the schema of a given project. * @return array An array of the tables associated with the specified project. */ function get_table_names($project,$group) { $group_names = $group->get_group_names(); $tables = array(); foreach($group_names as $name) { $tables[] = $project."_".$name; } return $tables; } /** * get_ids returns the next ID available to which a new row in the table can be assigned. * * This function is used when uploading in UPDATE_MODE. It returns an array containing the * next available ID for each table in a given project. In this case, the next available id is the * largest existing id + 1. Generally this function is called at the beginning of an update. The array of ids * is passed to the group object representing the schema of the given project. For a complete explanation of * this, please see the documentation for the group object. * * @param array $tables The tables for which IDs are needed. * @return array An associative array indexed by table name containing the next available id for that index. */ function get_ids($tables) { $ids = array(); foreach($tables as $name) { $i = strpos($name,"_") + 1; $group_name = substr($name,$i); $id_field = $group_name."_id"; $query = "SELECT max($id_field) FROM $name"; $query_result = mysql_query($query); $row = mysql_fetch_assoc($query_result); $ids[$group_name] = $row["max($id_field)"]+1; } return $ids; } /** * get_ids_fill returns the next available id for each table in the $tables argument. * * In this case, the next id is the first non-taken id beginning with 1. For example, ids * 1,2,4 may be taken. This function returns 3 as the next available id. Generally, this should not be * used when uploading more than one entry into the database. See get_ids for an explanation. * * @param array $tables The tables for which IDs are needed. * @return array An associative array indexed by table name containing the next available id for that index. */ function get_ids_fill($tables) { $ids = array(); foreach($tables as $name) { $id_field = $name."_id"; $query = "SELECT $id_field FROM $name ORDER BY $id_field"; $query_result = mysql_query($query); $count = 1; while($row = mysql_fetch_assoc($query_result)) { $id = $row[$id_field]; if($id != $count) { break; } $count++; } $ids[$name] = $count; } return $ids; } /** * create_sql_tables creates a table for each group contained in an entry within the xml document. * * This function creates sql tables for the specified project for each group within the xml document. * The function is recursive and works as follows: *

First, the type of ids for the table are determined. The head group will only have a main_id * assoiciated with it. All subgroups will have a parent_id, main_id and group id. The reasons for this * are explained in the group class documentation.
Next, if the current group is the head group, the xml document is searched for the largest size in bytes * of every field (including fields in subgroups). This information will be used to determine the type of column * begin created in the table. Note that even though this function is recursive, this search only takes place one: * when the current group is the head group. There is obviously no need to do it more than once.
A new column is then made for each field in the current group. If the maximum size for the given field * is found to be less than 50, then it''s type is set to 2 times its maximum size. This is purely hueristic and is * intended to save space in the database. Additionally, MySQL sets a limit on the number of text columns allowed * per table. This helps to ensure that limit is not reached. Of course, it is not a fullproof way of protecting * against this. A new table is then created in the MySQL database. If a table exists by the same name, it will * be deleted from the database and replaced by the new one.
Finally this function is called recursively for each subgroup belonging to the current group. A list of the * tables that have been created thus far are passed with each call to creat_sql_tables. If a table exists for the * current group, a new table will not be created. In other words, a project can not have multipe groups with the * same name. This aspect is also covered in the group class documentation.

* * @param string $project The project for which the new tables are being created for. * @param string $head_tag The main wrapper tag surrounding each entry in the XML document. * @param string $db_xml The path of the original XML document. * @param group $cur_group The group object for which the new database table will be created. * @param array $made_tables An array of the tables which have been created. This object is used to prevent the * same table from being created more than once. * @return array The made_tables array is returned after each function call. This array will be updated at the * end of a function call to include the table just created. I have chosen to make this object returned because * php4 doesn not allow static class variables. So it is the only way to pass dynamically created information * (the name of the new table created) to future recursive function calls. * */ function create_sql_tables($project, $head_tag, $db_xml, $cur_group, $made_tables) { $table_name = $project."_".$cur_group->name; $table = "CREATE TABLE $table_name \n("; if($cur_group->name != $head_tag) { $table .= $head_tag."_id int unsigned,\n"; $table .= "parent_id text,\n"; } $table .= $cur_group->name."_id int unsigned not null auto_increment primary key,\n"; if($cur_group->name == $head_tag) { $table .= "template text,\n"; if(file_exists($db_xml)) { $counter = new field_size_counter(); $this->field_sizes = $counter->get_field_sizes($db_xml); } else { $this->field_sizes = array(); } } foreach($cur_group->fields as $field=>$num) { $size = 1; if(array_key_exists($field,$this->field_sizes)) { $size = $this->field_sizes[$field]; if($size <= 50) { $n = 2*$size; $type = "varchar ($n)"; } else { $type = "text"; } } else { $type = "text"; } for($i=0;$i<$num;$i++) { $table .= $field."_$i $type,\n"; } } $table = substr(trim($table),0,-1).")"; mysql_query("DROP TABLE $table_name"); echo "table: $table
"; mysql_query($table); if(mysql_error()) { echo "method: create_sql_tables : ".mysql_error()."\n"; } $made_tables[] = $cur_group->name; foreach($cur_group->groups as $group) { if(!in_array($group->name,$made_tables)) { $subgroup_made_tables = $this->create_sql_tables($project,$head_tag,$db_xml,$group,$made_tables); foreach($subgroup_made_tables as $t) { if(!in_array($t, $made_tables)) { $made_tables[] = $t; } } } } return $made_tables; } /** * This functions handles the response when the beginning of a new tag is encountered during the parse. * * The following describes the inner workings of this function: *

If the element is a head group element, we are at the beginning of a new entry: *
- So all subgroups must be destroyed. This is vitally important in php4 because if all subgroups are * not explicitly destroyed, they do not get trash collected and very quickly, you get memory usage errors.
- Next, new ids must be assigned to the current group.
- The parent of the current group is set to false. The top level group obvioulsy has not * parent. Setting it the parent variable to false creates a method for testing whether you are at the root (if the * parent is false, you are at the root).
- The template is set to the empty string. Note that the template does not include the head group but * is assumed. See the group class documentation for an explanation.
- get_entry is set to true. This is a relic from an earlier way of extracting information from the xml * document. It is set to false at the end of an entry and set to true at the start tag introducing an entry.
*
If get_entry is true: *
- If the current tag is an id tag, we set delete_entry to true. See the explanation for the variable * delete_entry for an explanation.
- Otherwise, check to see if current element is one of the fields for the current group. *
  - If its not, it means that the tag marks the beginning of a new subgroup. We will create a new group, * provide it ids and place it in the group_values array of the current group. We will then reassign the * cur_group to this new child and continue with the parse.
  - If it is a field in the current group, we simply set the cur_tag to the element.
  *
*

*/ function startHandler($xp, $element, $attribs) { //echo "START: $element \n"; $element = strtolower($element); if($element == $this->head_tag) { $this->cur_group->destroy_subgroups(); $this->cur_group->assign_ids(Group::$ids[$this->head_tag],Group::$ids[$this->head_tag], Group::$ids[$this->head_tag]); $this->cur_group->parent = false; $this->template = ""; $this->get_entry = true; } else if($this->get_entry) { if($element == $this->cur_group->name."_id") { $this->delete_entry = true; $this->cur_tag = $element; } else { $this->template .= "$element "; if(!array_key_exists($element,$this->cur_group->fields)) { if(!array_key_exists($element,$this->cur_group->groups)) { echo "ERROR: $element not in ".$this->cur_group->name." line ".xml_get_current_line_number($this->parser)."
"; die("ERROR: XML does not match schema"); } //echo "moving down from ".$this->cur_group->name." to "; $new_group = clone $this->cur_group->groups[$element]; $new_group->assign_ids($this->cur_group->main_id, $this->cur_group->id, $this->cur_group->name); $this->cur_group->add_group_value($new_group); $this->cur_group = $new_group; unset($new_group); } else { $this->cur_tag = $element; } } } } /** * This functions handles the response for when an end tag is reached. * * The following describes the inner workings of this function: *

If the end tag equals the head tag, we are at the end of an entry in the xml document. We then: *
- Increment the counter.
- If the count is a multiple of 1000, we print a message saying so.
- We set the template of the current group to template.
- Add the current groups main id to head_tag_ids.
- Export the data in the current group into rows which are placed in the results array containing an * index for each table. If there are more than 20 rows, we write the rows to file. If the number of bytes * in the associated file for a given group is greater than the max file size, we bulk upload the file * into MySQL. The LOAD query depends on the server settings. Please see the MySQL documentation for an * explanation for how to use this function.
*
If the end tag is to the current group (though not the head tag), we have come to the end of a group. *
- Add a "/group_name" to the template. The slash identifies that this is the end of a group.
- The parent of the group just ended is set to the current group. In other words, we move down the tree * (towards the root).
*
If the first two conditions are not ture, the element must be the end of a field within the current group. *
- If the tag is an id tag, we do nothing with it. IDs are only set by the assign_ids function and are a meta- * field about an entry, not part of it. We do not want to include this as a field.
- Otherwise, take the values of cur_tag and cur_data and add a field to the current group.
*

*/ function endHandler($xp, $element) { //echo "END: $element \n"; $element = strtolower($element); if($this->head_tag == $element) { $this->count++; if(($this->count % 1000) == 0) { echo "Wrote entries ".($this->count-1000)." to ".$this->count."
"; } $this->cur_group->template = trim($this->template); $this->head_tag_ids[] = $this->cur_group->id; $result = $this->cur_group->to_ssv($this->head_tag); //echo $this->cur_group->print_group_info(); foreach($result as $table=>$rows) { foreach($rows as $row) { $this->results[$table][] = $row; } if(count($this->results[$table]) >= 20) { $s = ""; foreach($this->results[$table] as $row) { $s .= trim($row)."\n"; $this->sql_file_sizes[$this->project."_".$table] += (strlen($s) * 8); } $out = fopen($this->project."_".$table.".sql","a"); fwrite($out,$s); fclose($out); $cur_size = $this->sql_file_sizes[$this->project."_".$table]; if($cur_size >= $this->max_file_size) { $path = realpath($this->project."_".$table.".sql"); $path = str_replace("\\","\\\\",$path); $server = trim(getenv("SERVER_NAME")); if(($server == "127.0.0.1") || ($server == "")){ $query = "LOAD DATA"; } else { $query = "LOAD DATA LOCAL"; } $query .= " INFILE '$path' INTO TABLE ".$this->project."_".$table." FIELDS TERMINATED BY ' ' ENCLOSED BY '^'"; $query = $this->convert_to_insert_stmt($this->project."_".$table,$path); $query = utf8_decode($query); mysql_query($query); if(mysql_error()) { echo "method: endHandler : ".mysql_error()." : $table : ids through $this->count : filesize = $cur_size\n"; } unlink($path); $this->sql_file_sizes[$table] = 0; } unset($this->results[$table]); } } unset($result); $this->get_entry = false; } else if($element == $this->cur_group->name) { $this->template .= "/$element "; $child = &$this->cur_group; //echo "moving up from $child->name to "; unset($this->cur_group); $this->cur_group = &$child->parent; unset($child); //echo $this->cur_group->name."
"; } else if($element != $this->cur_group->name."_id"){ //echo "adding: $element = $this->cur_data \n"; $this->cur_group->add_value($this->cur_tag, $this->cur_data); $this->cur_tag = ""; $this->cur_data = ""; } } /** * Function for handling cDATA encountered when parsing the XML document. * * If delete_entry is true, the current tag must be an id tag. The current group''s id will be set to the * value of the current cDATA and the ids[group] will be decremented. When the group is initially created, * it's ids are assigned to it's respective groups. It is not until an id tag is parsed that we can know * the id originally assigned is not necessary. We still want to make this id available for later use and that is * why we decrement the ids[group_name] value. */ function cDataHandler($xp, $data) { $data = trim($data); if($this->delete_entry) { if($data != "") { $this->cur_group->id = $data; $this->cur_group->ids[$this->cur_group->name]--; if($this->cur_tag == $this->head_tag."_id") { $this->cur_group->main_id = $data; } $this->delete_entry($this->project."_".$this->cur_group->name,$this->cur_group->name."_id",$data); } $this->delete_entry = false; } else if($data != "") { $data = utf8_decode($data); $data = ereg_replace("&","&",$data); $data = ereg_replace("<","<",$data); $data = ereg_replace(">",">",$data); $this->cur_data .= $data; } } /** * parse begins the parsing of the specified xml document. * * This function can take it''s argument in two forms. First it checks if the file name has a '<' tag. If it does, * it is assumed that the xml is a string. If it does not, it treats db_xml as a file name and opens up the file * and begins parsing. Note that at the end of this function, it writes any remaining data in the $results variable * to the associated sql files. * * @todo It is not clear why the $results dumping should happen in this function. Consider another place for it. * @param string $db_xml Either an XML string or an xml file name. */ function parse($db_xml) { $this->parser = xml_parser_create(); xml_set_object($this->parser,$this); xml_set_element_handler($this->parser,"startHandler","endHandler"); xml_set_character_data_handler($this->parser,"cDataHandler"); if(strpos($db_xml,"<") === false) { if(file_exists($db_xml)) { $in = fopen($db_xml,"r"); $i = 0; while($line = fgets($in)) { $line = ereg_replace("&","&",$line); $good_parse = xml_parse($this->parser,$line,false); if(!$good_parse) { echo "there was an error
"; } set_time_limit(1000); } } } else { xml_parse($this->parser,$db_xml); } foreach($this->results as $table=>$rows) { foreach($rows as $row) { $s = ""; $s .= trim($row)."\n"; //$s = utf8_encode($s); $out = fopen($this->project."_".$table.".sql","a"); fwrite($out,$s); fclose($out); } } } /** * delete_entry deletes all rows in a given table where $field=$value * * @param string $table The table from which the entries will be deleted. * @param string $field The field which will be tested in search of entries to be deleted. * @param string $value All rows with in which $field contains specified value will be deleted. */ function delete_entry($table, $field, $value) { $query = "DELETE FROM $table WHERE $field='$value'"; @mysql_query($query); if(mysql_error()) { echo "method: delete_entry : mysql_error()
"; } } /** * This function will place the new value into the first column of the first table returned by the function * group->find_group. This function is meant to be used only for columns in which there exists a one column * by the given name contained in only one table. */ function update_column($id_col,$id,$col,$value) { $tables = $this->cur_group->find_group($col); $table = key($tables); $query = "UPDATE $table SET ".$col."_0=$value WHERE $id_col=$id"; return mysql_query($query); } /** * xml_to_db_bulk creates a new project and uploads an xml file into the MySQL database. * *

* This is the primary function to be called for uploading an xml file into the MySQL database. It is called * 'bulk' because is uses the SQL "LOAD DATA" function rather than doing update queries. As the XML file is * parsed, it is written to disk in a CSV like format. Once a sizable amount is written, the file is then uploaded * to the MySQL database using the "LOAD DATA" function. Consequently, depending on the size of the XML file * the "LOAD" function may be called multiple times. *

* Also note that a new table is created for each subgroup within an XML entry. Consider: * * VALUE * * VALUE * * * In this example, two tables will be created: one called 'project_head_tag', the other 'project_group1'. (Note * that all table names and field names are converted to lowercase.) *

* This function can be used both to create a new database as well as to update an existing one via the mode * paramater. *

If a new database is created, *
- A new set of ids is created: each table has a primary key called 'id'. The most recently used id * is kept in this associative array indexed by table name. *
- New tables are created using the create_new_tables method.
*
If a database is being updated: the most recent set of ids are retrieved from each * table associated with the project. No new tables are created. *

* The "LOAD" function is only called once the sql files reach a certain size. When the document is finished * being parsed, data may still exist in the sql files and must be inputted into the MySQL database. The last section * (beginning with the 'foreach' block) of code writes any data remaining in the sql files to the MySQL database. * A few things to note about this code. *

The sql files are written on the server. In order for MySQL to find these files, the absolute path must * be provided in the "LOAD" function call. *
If JLex is being run on a Windows server, the path will include backslashes, \. These need to be escaped * in order for MySQL to read these properly. Php recognizes the escaped backslash as the backslash character, * i.e. \\ for \. So the code to replace a single backslash with a double backslash will be \\\\ for a \\. *
If the server is being run locally, you use the "LOAD DATA" commmand. If not, use the "LOAD DATA LOCAL" * command. I do not fully understand why, but that is what I have found to be the case. *

* * @param string $project The name of the project being either created or updated. * @param string $schema The name of the schema file associated with this project. * @param string $db_xml The name of the XML file to be uploaded. * @param integer $mode The type of upload to be performed (either $CREATE_NEW_DATABASE or $UPDATE_DATABASE) * @return array The ids of the new rows created in the head_group table. */ function xml_to_db_bulk($project,$schema,$db_xml,$mode) { $this->project = $project; $sl = new schema_loader(); $this->cur_group = $sl->create_group_structure($schema, $db_xml); $this->head_tag = $this->cur_group->name; $tables = $this->get_table_names($project,$this->cur_group); $this->sql_file_sizes = array(); foreach($tables as $table) { $this->sql_file_sizes[$table] = 0; } if($mode == self::$CREATE_NEW_DATABASE) { $this->cur_group->init_ids(); $made_tables = array(); $this->create_sql_tables($project,$this->head_tag, $db_xml,$this->cur_group,$made_tables); } else { group::$ids = $this->get_ids($tables); } $this->parse($db_xml); /* foreach($tables as $table) { $path = realpath($table.".sql"); $path = str_replace("\\","\\\\",$path); $server = trim(getenv("SERVER_NAME")); if(file_exists($path)) { if(($server == "127.0.0.1") || ($server == "")) { $query = "LOAD DATA"; } else { $query = "LOAD DATA LOCAL"; } $query .= " INFILE '$path' INTO TABLE $table FIELDS TERMINATED BY ' ' ENCLOSED BY '^'"; $query = $this->convert_to_insert_stmt($table,$path); echo "query: $query
"; $query = utf8_decode($query); mysql_query($query); if(mysql_error()) { echo "method: xml_to_db_bulk : ".mysql_error()."\n"; } unlink($table.".sql"); } } */ xml_parser_free($this->parser); $this->cur_group->destroy_subgroups(); unset($this->cur_group); return $this->head_tag_ids; } /** * xml_to_insert inserts rows into relevant tables via the insert function. * * Originally, I had some problems getting the "LOAD" function to work properly. As a temporary * workaround, I wrote this function. However, I do not use this function anywhere in the JLex code. * This function is only used to update a project, not create a new one. * * @param string $project The name of the project being either created or updated. * @param string $schema The name of the schema file associated with this project. * @param string $db_xml The name of the XML file to be uploaded. * @return array The ids of the new rows created in the head_group table. */ function xml_to_insert($project,$schema,$db_xml) { $sl = new schema_loader(); $this->cur_group = $sl->create_group_structure($schema, $db_xml); $this->head_tag = $this->cur_group->name; $tables = $this->get_table_names($project,$this->cur_group); $this->sql_file_sizes = array(); $this->parse($db_xml); foreach($tables as $table) { $in = fopen($table.".sql","r"); while($line = fgets($in)) { $line = str_replace("^","'",$line); $line = str_replace(" ",",",$line); $line = "(".$line.")"; $query = "INSERT INTO $table () VALUES $line"; mysql_query($query); if(mysql_error()) { echo "method: xml_to_insert : ".mysql_error()."\n"; } } fclose($in); unlink($table.".sql"); } return $this->head_tag_ids; } function convert_to_insert_stmt($table, $file_name) { $insert_stmt = "INSERT INTO $table VALUES "; $in = fopen($file_name,"r"); while($line = fgets($in)) { $line = trim($line); $line = str_replace("\"","\\\"",$line); $line = str_replace("^ ^","\",\"",$line); $line = "\"".substr($line,1,-1)."\""; $insert_stmt .= "($line),"; } $insert_stmt = substr($insert_stmt,0,-1); return $insert_stmt; } function update_entry($project, $group) { $qo = new query_object(); $qo->set_project($project); $main_id = $group->main_id; $id_field = $group->name."_id"; $qo->add_condition_set($id_field,"field REGEXP '^~$'",$main_id); $converter = new mysql_to_xml(); $qo = $converter->query_database($qo); $ids = $qo->get_main_ids(); if(count($ids) == 1) { $id = $ids[0]; } else { die("Error: No such entry with $id_field = $main_id exists."); } if($update_type == $this->DELETE_REFS) { $group->group_vals["refset"] = array(); $group->template = "username "; } foreach($refs as $ref){ $g = clone $group->groups["refset"]; $g->add_value("ref",$ref); $group->group_vals["refset"][] = $g; $group->template .= "refset ref /refset "; unset($g); } $xml = "\n\n"; $template = explode(" ",trim($group->template)); $xml .= $group->to_xml_by_template($template,true); $xml .= ""; $dl = new db_loader(); $dl->xml_to_db_bulk("mydict","../mydict/schema.xml",$xml,$dl->UPDATE_DATABASE); unset($_SESSION["ref_ids"]); } } /* $dl = new db_loader(); $project = "test"; $schema = "nahuatl_full_schema.xml"; $db_xml = "test.xml"; $mode = $dl->CREATE_NEW_DATABASE; $dl->xml_to_db_bulk($project,$schema,$db_xml,$mode); */ ?>