Parse .csv file including quoted fields

Is there anyone that has already programmed in 4D .csv file parsing?
I have already programmed a method that takes a string and turns it to a string array using a separator character (comma or semicolon) but I didn’t take care of double-quotes in fields that may contain the separator character.
Just to avoid reinvent the wheel has anybody written the code to unquote the quoted fields?

Already found what I was looking for in old forum.
Here’s the link

https://forums.4d.com/Post/SP/15845771/2/16000282#15845772

Another approach, produces a collection:

1 Like

For reference:

Thanks, Keisuke, for sharing your code.

Kind regards!

Gerardo

The “good” link is not the first, but after. I paste the corresponding method here, easier translation I suppose… I use it intensively.
I’m very interested in Lutz version resulting in a collection, more “modern”: thank you, Lutz. I’d like to know if the result as 2d text array still remains interesting, depending on csv size and what we do with it after.

  //CSV_toArray2DwithRegex(txtCSV_t;tabTxt2D_p { ;sepChamp_t {;sepEnreg_t)
  //parse le texte CSV $1 dans un tableau texte 2 dimensions pointé par $2
  //en sortie ce tableau aura :
  //  autant de colonnes que de champs
  //  autant de lignes que d'enregistrements
  //$3 optionnel sepChamp_t "," par défaut
  //$4 optionnel sepEnreg_t "\n" par défaut
  //µ Arnaud * 02/10/2015 * debug
  //http://forums.4d.fr/Post/FR/11725374/0/0/#15938669
C_TEXT($0)
C_TEXT($1)
C_POINTER($2)
C_TEXT($3)
C_TEXT($4)

C_LONGINT($fld_l)
C_LONGINT($i_l)
C_LONGINT($maxFld_l)
C_LONGINT($params_l)
C_LONGINT($rec_l)
C_LONGINT($start_l)
C_POINTER($arr2Dtext_p)
C_TEXT($cell_t)
C_TEXT($error_t)
C_TEXT($fieldSep_t)
C_TEXT($group_t)
C_TEXT($nmc_t)
C_TEXT($pattern_t)
C_TEXT($recSep_t)
C_TEXT($txt_t)

If (False)
	C_TEXT(CSV_toArray2DwithRegex ;$1)
	C_POINTER(CSV_toArray2DwithRegex ;$2)
	C_TEXT(CSV_toArray2DwithRegex ;$3)
	C_TEXT(CSV_toArray2DwithRegex ;$4)
End if 
  //_
$nmc_t:=Current method name
$params_l:=Count parameters
Case of 
	: (Not(Asserted($params_l>1;$nmc_t+" 2 params expected")))
		$error_t:=$nmc_t+" 2 params expected"
	: (Length($1)=0)
		CLEAR VARIABLE($2->)
	: (Not(Asserted(Type($2->)=Array 2D;$nmc_t+" $2 pointer 2D text array expected")))
		$error_t:=$nmc_t+" $2 pointer 2D text array expected"
	: (Not(Asserted(Type($2->{0})=Text array;$nmc_t+" $2 pointer 2D text array expected")))
		$error_t:=$nmc_t+" $2 pointer 2D text array expected"
	Else 
		$txt_t:=$1
		$arr2Dtext_p:=$2
		$recSep_t:="\n"
		$fieldSep_t:=","
		If ($params_l>2)
			$fieldSep_t:=$3
			If ($params_l>3)
				$recSep_t:=$4
			End if 
		End if 
		
		CLEAR VARIABLE($arr2Dtext_p->)
		ARRAY TEXT($data_a2t;1;1)  //tableau local pour éviter dépointages
		
		  //la regex à momo qui tue
		$pattern_t:="(?m)("+$fieldSep_t+"|\\r?\\n|^)([^\""+$fieldSep_t+"\\r\\n]+|\"(?:[^\"]|\"\")*\"|)"
		ARRAY LONGINT($pos_al;0x0000)  //captures de la regex
		ARRAY LONGINT($len_al;0x0000)
		$start_l:=1
		$fld_l:=0
		$rec_l:=0
		$maxFld_l:=0
		
		While (Match regex($pattern_t;$txt_t;$start_l;$pos_al;$len_al))
			
			$patternResult_t:=Substring($txt_t;$pos_al{0};$len_al{0})
			$group_t:=Substring($txt_t;$pos_al{1};$len_al{1})  //groupe 1 = séparateur en début
			
			If ($group_t=$fieldSep_t)  //| ($group_t="")
				$fld_l:=$fld_l+1
			Else 
				$rec_l:=$rec_l+1
				$fld_l:=1
			End if 
			
			If (Size of array($data_a2t{1})<$rec_l)  //manque de lignes
				For ($i_l;1;Size of array($data_a2t))
					INSERT IN ARRAY($data_a2t{$i_l};$rec_l;5000)  //par paquets…
				End for 
			End if 
			
			If ($maxFld_l<$fld_l)  //manque de colonnes
				$maxFld_l:=$fld_l
				If (Size of array($data_a2t)<$maxFld_l)
					INSERT IN ARRAY($data_a2t;$fld_l;20)
				End if 
			End if 
			
			If ($len_al{2}>0)  //champ non vide
				If ($txt_t[[$pos_al{2}]]="\"")  //commence par "
					$group_t:=Substring($txt_t;$pos_al{2}+1;$len_al{2}-2)
					If (Position("\"";$group_t;1;*)>0)
						$group_t:=Replace string($group_t;"\"\"";"\"")  //double "" --> simple "
					End if 
					$cell_t:=$group_t
				Else 
					$cell_t:=Substring($txt_t;$pos_al{2};$len_al{2})
				End if 
				$data_a2t{$fld_l}{$rec_l}:=$cell_t
			End if 
			
			$start_l:=$pos_al{0}+$len_al{0}
		End while 
		
		If (Size of array($data_a2t)>$maxFld_l)  //trop de colonnes
			DELETE FROM ARRAY($data_a2t;$maxFld_l+1;Size of array($data_a2t)-$maxFld_l)
		End if 
		
		If (Size of array($data_a2t{1})>$rec_l)  //trop de lignes
			For ($i_l;1;Size of array($data_a2t))
				DELETE FROM ARRAY($data_a2t{$i_l};$rec_l+1;Size of array($data_a2t{$i_l})-$rec_l)
			End for 
		End if 
		
		  //copier dans tableau de retour
		INSERT IN ARRAY($arr2Dtext_p->;1;$maxFld_l)
		For ($i_l;1;Size of array($data_a2t))
			COPY ARRAY($data_a2t{$i_l};$arr2Dtext_p->{$i_l})
			CLEAR VARIABLE($data_a2t{$i_l})
		End for 
		
End case 
$0:=$error_t
  //_

1 Like

A direct extract using AJ_Tools_Regex is available on our repository on Github
See the example with AJ_Tools_RegexLab

1 Like